LCOV - code coverage report
Current view: top level - include/xapian - cluster.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 16 18 88.9 %
Date: 2019-05-23 11:15:29 Functions: 18 32 56.2 %
Branches: 12 32 37.5 %

           Branch data     Line data    Source code
       1                 :            : /** @file cluster.h
       2                 :            :  *  @brief Cluster API
       3                 :            :  */
       4                 :            : /* Copyright (C) 2010 Richard Boulton
       5                 :            :  * Copyright (C) 2016 Richhiey Thomas
       6                 :            :  * Copyright (C) 2018 Uppinder Chugh
       7                 :            :  *
       8                 :            :  * This program is free software; you can redistribute it and/or
       9                 :            :  * modify it under the terms of the GNU General Public License as
      10                 :            :  * published by the Free Software Foundation; either version 2 of the
      11                 :            :  * License, or (at your option) any later version.
      12                 :            :  *
      13                 :            :  * This program is distributed in the hope that it will be useful,
      14                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16                 :            :  * GNU General Public License for more details.
      17                 :            :  *
      18                 :            :  * You should have received a copy of the GNU General Public License
      19                 :            :  * along with this program; if not, write to the Free Software
      20                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      21                 :            :  * USA
      22                 :            :  */
      23                 :            : 
      24                 :            : #ifndef XAPIAN_INCLUDED_CLUSTER_H
      25                 :            : #define XAPIAN_INCLUDED_CLUSTER_H
      26                 :            : 
      27                 :            : #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
      28                 :            : #error "Never use <xapian/cluster.h> directly; include <xapian.h> instead."
      29                 :            : #endif
      30                 :            : 
      31                 :            : #include <xapian/attributes.h>
      32                 :            : #include <xapian/mset.h>
      33                 :            : #include <xapian/queryparser.h>
      34                 :            : #include <xapian/types.h>
      35                 :            : #include <xapian/visibility.h>
      36                 :            : 
      37                 :            : #include <unordered_map>
      38                 :            : #include <unordered_set>
      39                 :            : #include <vector>
      40                 :            : 
      41                 :            : namespace Xapian {
      42                 :            : 
      43                 :            : /// Stopper subclass which checks for both stemmed and unstemmed stopwords
      44         [ -  + ]:         56 : class XAPIAN_VISIBILITY_DEFAULT StemStopper : public Xapian::Stopper {
      45                 :            :   public:
      46                 :            :     /// Stemming strategies
      47                 :            :     typedef enum {
      48                 :            :         STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z, STEM_SOME_FULL_POS
      49                 :            :     } stem_strategy;
      50                 :            : 
      51                 :            :     /** Constructor
      52                 :            :      *
      53                 :            :      *  @param stemmer  The Xapian::Stem object to set.
      54                 :            :      *  @param strategy The stemming strategy to be used (defaults to STEM_SOME).
      55                 :            :      */
      56                 :            :     explicit StemStopper(const Xapian::Stem &stemmer, stem_strategy strategy = STEM_SOME);
      57                 :            : 
      58                 :            :     std::string get_description() const;  ///< Return a description string (defined out of line).
      59                 :            : 
      60                 :         84 :     bool operator()(const std::string & term) const {
      61         [ +  - ]:         84 :         return stop_words.find(term) != stop_words.end();  // true iff term is in stop_words
      62                 :            :     }
      63                 :            : 
      64                 :            :     /// Add a single stop word and its stemmed equivalent
      65                 :            :     void add(const std::string &term);
      66                 :            : 
      67                 :            :   private:
      68                 :            :     stem_strategy stem_action;
      69                 :            :     std::unordered_set<std::string> stop_words;  ///< Set searched by operator().
      70                 :            :     Xapian::Stem stemmer;
      71                 :            : };
      72                 :            : 
      73                 :            : /** Class representing a set of documents in a cluster
      74                 :            :  */
      75                 :          6 : class XAPIAN_VISIBILITY_DEFAULT DocumentSet {
      76                 :            :   public:
      77                 :            :     class Internal;  // forward declaration; the type pointed to by 'internal'
      78                 :            :     /// @private @internal Reference counted internals.
      79                 :            :     Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
      80                 :            : 
      81                 :            :     /** Copying is allowed.  The internals are reference counted, so
      82                 :            :      *  copying is cheap.
      83                 :            :      *
      84                 :            :      *  @param other    The object to copy.
      85                 :            :      */
      86                 :            :     DocumentSet(const DocumentSet &other);
      87                 :            : 
      88                 :            :     /** Assignment is allowed.  The internals are reference counted,
      89                 :            :      *  so assignment is cheap.
      90                 :            :      *
      91                 :            :      *  @param other    The object to copy.
      92                 :            :      */
      93                 :            :     DocumentSet & operator=(const DocumentSet &other);
      94                 :            : 
      95                 :            :     /** Move constructor.
      96                 :            :      *
      97                 :            :      * @param other     The object to move.
      98                 :            :      */
      99                 :            :     DocumentSet(DocumentSet && other);
     100                 :            : 
     101                 :            :     /** Move assignment operator.
     102                 :            :      *
     103                 :            :      * @param other     The object to move.
     104                 :            :      */
     105                 :            :     DocumentSet & operator=(DocumentSet && other);
     106                 :            : 
     107                 :            :     /// Default constructor
     108                 :            :     DocumentSet();
     109                 :            : 
     110                 :            :     /// Destructor
     111                 :            :     ~DocumentSet();
     112                 :            : 
     113                 :            :     /// Return the size of the DocumentSet
     114                 :            :     Xapian::doccount size() const;
     115                 :            : 
     116                 :            :     /// Return the Document in the DocumentSet at index i
     117                 :            :     Xapian::Document& operator[](Xapian::doccount i);
     118                 :            : 
     119                 :            :     /// Return the Document in the DocumentSet at index i (const overload)
     120                 :            :     const Xapian::Document& operator[](Xapian::doccount i) const;
     121                 :            : 
     122                 :            :     /** Add a new Document to the DocumentSet
     123                 :            :      *
     124                 :            :      *  @param document         Document object that is to be added to
     125                 :            :      *                          the DocumentSet
     126                 :            :      */
     127                 :            :     void add_document(const Document &document);
     128                 :            : };
     129                 :            : 
     130                 :            : /** Base class for TermListGroup and DummyFreqSource
     131                 :            :  *  Stores and provides terms that are contained in a document and
     132                 :            :  *  their respective term frequencies
     133                 :            :  */
     134                 :            : class XAPIAN_VISIBILITY_DEFAULT FreqSource
     135                 :            :     : public Xapian::Internal::opt_intrusive_base {
     136                 :            :     /// Don't allow assignment.
     137                 :            :     void operator=(const FreqSource &) = delete;
     138                 :            : 
     139                 :            :     /// Don't allow copying.
     140                 :            :     FreqSource(const FreqSource &) = delete;
     141                 :            : 
     142                 :            :   public:
     143                 :            :     /// Default constructor
     144                 :         30 :     FreqSource() {}
     145                 :            : 
     146                 :            :     /// Destructor
     147                 :            :     virtual ~FreqSource();
     148                 :            : 
     149                 :            :     /** Return the term frequency of a particular term 'tname'
     150                 :            :      *
     151                 :            :      *  @param tname    The term for which to return the term frequency
     152                 :            :      */
     153                 :            :     virtual doccount get_termfreq(const std::string &tname) const = 0;
     154                 :            : 
     155                 :            :     /// Return the number of documents within the MSet
     156                 :            :     virtual doccount get_doccount() const = 0;
     157                 :            : 
     158                 :            :     /** Start reference counting this object.
     159                 :            :      *
     160                 :            :      *  You can hand ownership of a dynamically allocated FreqSource
     161                 :            :      *  object to Xapian by calling release() and then passing the object to a
     162                 :            :      *  Xapian method.  Xapian will arrange to delete the object once it is no
     163                 :            :      *  longer required.
     164                 :            :      */
     165                 :            :     FreqSource * release() {
     166                 :            :         opt_intrusive_base::release();
     167                 :            :         return this;
     168                 :            :     }
     169                 :            : 
     170                 :            :     /** Start reference counting this object (const overload).
     171                 :            :      *
     172                 :            :      *  You can hand ownership of a dynamically allocated FreqSource
     173                 :            :      *  object to Xapian by calling release() and then passing the object to a
     174                 :            :      *  Xapian method.  Xapian will arrange to delete the object once it is no
     175                 :            :      *  longer required.
     176                 :            :      */
     177                 :            :     const FreqSource * release() const {
     178                 :            :         opt_intrusive_base::release();
     179                 :            :         return this;
     180                 :            :     }
     181                 :            : };
     182                 :            : 
     183                 :            : /** A dummy frequency source for construction of termlists
     184                 :            :  *  This returns 1 as the term frequency for any term
     185                 :            :  */
     186         [ #  # ]:          0 : class XAPIAN_VISIBILITY_DEFAULT DummyFreqSource : public FreqSource {
     187                 :            :   public:
     188                 :            :     /// Return the value 1 as a dummy term frequency
     189                 :            :     doccount get_termfreq(const std::string &) const;
     190                 :            : 
     191                 :            :     doccount get_doccount() const;  ///< Implements FreqSource::get_doccount().
     192                 :            : };
     193                 :            : 
     194                 :            : /** A class for construction of termlists which store the terms for a
     195                 :            :  *  document along with the number of documents it indexes i.e. term
     196                 :            :  *  frequency
     197                 :            :  */
     198         [ -  + ]:         30 : class XAPIAN_VISIBILITY_DEFAULT TermListGroup : public FreqSource {
     199                 :            :     /** Map of the terms and its corresponding term frequencies.
     200                 :            :      *  The term frequency of a term stands for the number of documents it indexes
     201                 :            :      */
     202                 :            :     std::unordered_map<std::string, doccount> termfreq;
     203                 :            : 
     204                 :            :     /// Number of documents added to the termlist
     205                 :            :     doccount num_of_documents;
     206                 :            : 
     207                 :            :     /** Add a single document and calculate its corresponding term frequencies
     208                 :            :      *
     209                 :            :      *  @param document         Document to add; the TermListGroup is updated
     210                 :            :      *                          based on the terms found in the document
     211                 :            :      *  @param stopper          Xapian::Stopper object to identify stopwords
     212                 :            :      */
     213                 :            :     void add_document(const Document &document, const Stopper *stopper = NULL);
     214                 :            : 
     215                 :            :   public:
     216                 :            :     /** Constructor
     217                 :            :      *
     218                 :            :      *  @param docs     MSet object used to construct the TermListGroup
     219                 :            :      *  @param stopper  Xapian::Stopper object to identify stopwords
     220                 :            :      */
     221                 :            :     explicit TermListGroup(const MSet &docs, const Stopper *stopper = NULL);
     222                 :            : 
     223                 :            :     /** Return the number of documents that the term 'tname' exists in
     224                 :            :      *
     225                 :            :      *  @param tname    The term for which to return the term frequency
     226                 :            :      */
     227                 :            :     doccount get_termfreq(const std::string &tname) const;
     228                 :            : 
     229                 :            :     doccount get_doccount() const;  ///< Implements FreqSource::get_doccount().
     230                 :            : };
     231                 :            : 
     232                 :            : /** Abstract class representing a point in the Vector Space Model (VSM)
     233                 :            :  */
     234 [ +  - ][ -  + ]:        969 : class XAPIAN_VISIBILITY_DEFAULT PointType
         [ #  # ][ #  # ]
                 [ #  # ]
     235                 :            :     : public Xapian::Internal::opt_intrusive_base {
     236                 :            :   protected:
     237                 :            :     /** Implement a map to store the terms within a document
     238                 :            :      *  and their pre-computed TF-IDF weights
     239                 :            :      */
     240                 :            :     std::unordered_map<std::string, double> weights;
     241                 :            : 
     242                 :            :     /// Store the squared magnitude of the PointType
     243                 :            :     double magnitude;
     244                 :            : 
     245                 :            :     /** Set the weight 'weight' to the mapping of a term
     246                 :            :      *
     247                 :            :      *  @param term     Term for which the weight is supposed
     248                 :            :      *                  to be changed
     249                 :            :      *  @param weight   The weight to which the mapping of the
     250                 :            :      *                  term is to be set
     251                 :            :      */
     252                 :            :     void set_weight(const std::string &term, double weight);
     253                 :            : 
     254                 :            :   public:
     255                 :            :     /// Default constructor
     256         [ +  - ]:        114 :     PointType() : magnitude(0.0) {}
     257                 :            : 
     258                 :            :     /// Return a TermIterator to the beginning of the termlist
     259                 :            :     TermIterator termlist_begin() const;
     260                 :            : 
     261                 :            :     /// Return a TermIterator to the end of the termlist
     262                 :         24 :     TermIterator XAPIAN_NOTHROW(termlist_end() const) {
     263                 :         24 :         return TermIterator(NULL);  // a NULL-constructed TermIterator serves as the end iterator
     264                 :            :     }
     265                 :            : 
     266                 :            :     /** Validate whether a certain term exists in the termlist
     267                 :            :      *  or not by performing a lookup operation in the existing values
     268                 :            :      *
     269                 :            :      *  @param term     Term which is to be searched
     270                 :            :      */
     271                 :            :     bool contains(const std::string &term) const;
     272                 :            : 
     273                 :            :     /** Return the TF-IDF weight associated with a certain term
     274                 :            :      *
     275                 :            :      *  @param term     Term for which TF-IDF weight is returned
     276                 :            :      */
     277                 :            :     double get_weight(const std::string &term) const;
     278                 :            : 
     279                 :            :     /** Add the weight 'weight' to the mapping of a term
     280                 :            :      *
     281                 :            :      *  @param term     Term to which the weight is to be added
     282                 :            :      *  @param weight   Weight which has to be added to the existing
     283                 :            :      *                  mapping of the term
     284                 :            :      */
     285                 :            :     void add_weight(const std::string &term, double weight);
     286                 :            : 
     287                 :            :     /// Return the pre-computed squared magnitude
     288                 :            :     double get_magnitude() const;
     289                 :            : 
     290                 :            :     /// Return the size of the termlist
     291                 :            :     Xapian::termcount termlist_size() const;
     292                 :            : 
     293                 :            :     /** Start reference counting this object.
     294                 :            :      *
     295                 :            :      *  You can hand ownership of a dynamically allocated PointType
     296                 :            :      *  object to Xapian by calling release() and then passing the object to a
     297                 :            :      *  Xapian method.  Xapian will arrange to delete the object once it is no
     298                 :            :      *  longer required.
     299                 :            :      */
     300                 :            :     PointType * release() {
     301                 :            :         opt_intrusive_base::release();
     302                 :            :         return this;
     303                 :            :     }
     304                 :            : 
     305                 :            :     /** Start reference counting this object (const overload).
     306                 :            :      *
     307                 :            :      *  You can hand ownership of a dynamically allocated PointType
     308                 :            :      *  object to Xapian by calling release() and then passing the object to a
     309                 :            :      *  Xapian method.  Xapian will arrange to delete the object once it is no
     310                 :            :      *  longer required.
     311                 :            :      */
     312                 :            :     const PointType * release() const {
     313                 :            :         opt_intrusive_base::release();
     314                 :            :         return this;
     315                 :            :     }
     316                 :            : };
     317                 :            : 
     318                 :            : /** Class to represent a document as a point in the Vector Space
     319                 :            :  *  Model
     320                 :            :  */
     321 [ -  + ][ +  - ]:        828 : class XAPIAN_VISIBILITY_DEFAULT Point : public PointType {
                 [ +  - ]
     322                 :            :     /// The document which is being represented by the Point
     323                 :            :     Document document;
     324                 :            : 
     325                 :            :   public:
     326                 :            :     /** Constructor
     327                 :            :      *  Initialise the point with terms and corresponding TF-IDF weights
     328                 :            :      *
     329                 :            :      *  @param tlg              TermListGroup object which provides the term
     330                 :            :      *                          frequencies.  It is used for TF-IDF weight
     331                 :            :      *                          calculations
     332                 :            :      *  @param document         The Document object over which the Point object
     333                 :            :      *                          will be initialised
     334                 :            :      */
     335                 :            :     Point(const TermListGroup &tlg, const Document &document);
     336                 :            : 
     337                 :            :     /// Return the document corresponding to this Point
     338                 :            :     Document get_document() const;
     339                 :            : };
     340                 :            : 
     341                 :            : /** Class to represent cluster centroids in the vector space
     342                 :            :  */
     343         [ -  + ]:         66 : class XAPIAN_VISIBILITY_DEFAULT Centroid : public PointType {
     344                 :            :   public:
     345                 :            :     /// Default constructor
     346                 :            :     Centroid();
     347                 :            : 
     348                 :            :     /** Constructor with Point argument
     349                 :            :      *
     350                 :            :      *  @param point    Point object to which Centroid object is
     351                 :            :      *                  initialised. The document vector and the
     352                 :            :      *                  magnitude are made equal
     353                 :            :      */
     354                 :            :     explicit Centroid(const Point &point);
     355                 :            : 
     356                 :            :     /** Divide the weight of terms in the centroid by 'cluster_size' and
     357                 :            :      *  recalculate the magnitude
     358                 :            :      *
     359                 :            :      *  @param cluster_size     Value by which Centroid document vector is
     360                 :            :      *                          divided
     361                 :            :      */
     362                 :            :     void divide(double cluster_size);
     363                 :            : 
     364                 :            :     /// Clear the terms and corresponding values of the centroid
     365                 :            :     void clear();
     366                 :            : 
     367                 :            :     /// Recalculate the magnitude of the centroid
     368                 :            :     void recalc_magnitude();
     369                 :            : };
     370                 :            : 
     371                 :            : /** Class to represent a Cluster which contains Points and Centroid
     372                 :            :  *  of the Cluster
     373                 :            :  */
     374                 :        120 : class XAPIAN_VISIBILITY_DEFAULT Cluster {
     375                 :            :   public:
     376                 :            :     class Internal;  // forward declaration; the type pointed to by 'internal'
     377                 :            :     /// @private @internal Reference counted internals.
     378                 :            :     Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
     379                 :            : 
     380                 :            :     /** Copying is allowed.  The internals are reference counted, so
     381                 :            :      *  copying is cheap.
     382                 :            :      *
     383                 :            :      *  @param other    The object to copy.
     384                 :            :      */
     385                 :            :     Cluster(const Cluster &other);
     386                 :            : 
     387                 :            :     /** Assignment is allowed.  The internals are reference counted,
     388                 :            :      *  so assignment is cheap.
     389                 :            :      *
     390                 :            :      *  @param other    The object to copy.
     391                 :            :      */
     392                 :            :     Cluster& operator=(const Cluster &other);
     393                 :            : 
     394                 :            :     /** Move constructor.
     395                 :            :      *
     396                 :            :      * @param other     The object to move.
     397                 :            :      */
     398                 :            :     Cluster(Cluster && other);
     399                 :            : 
     400                 :            :     /** Move assignment operator.
     401                 :            :      *
     402                 :            :      * @param other     The object to move.
     403                 :            :      */
     404                 :            :     Cluster & operator=(Cluster && other);
     405                 :            : 
     406                 :            :     /** Constructor
     407                 :            :      *
     408                 :            :      *  @param centroid         The centroid of the cluster object is
     409                 :            :      *                          assigned to 'centroid'
     410                 :            :      */
     411                 :            :     explicit Cluster(const Centroid &centroid);
     412                 :            : 
     413                 :            :     /// Default constructor
     414                 :            :     Cluster();
     415                 :            : 
     416                 :            :     /// Destructor
     417                 :            :     ~Cluster();
     418                 :            : 
     419                 :            :     /// Return size of the cluster
     420                 :            :     Xapian::doccount size() const;
     421                 :            : 
     422                 :            :     /** Add a document to the Cluster
     423                 :            :      *
     424                 :            :      *  @param point    The Point object representing the document which
     425                 :            :      *                  needs to be added to the cluster
     426                 :            :      */
     427                 :            :     void add_point(const Point &point);
     428                 :            : 
     429                 :            :     /// Clear the cluster weights
     430                 :            :     void clear();
     431                 :            : 
     432                 :            :     /// Return the point at the given index in the cluster
     433                 :            :     Point& operator[](Xapian::doccount i);
     434                 :            : 
     435                 :            :     /// Return the point at the given index in the cluster (const overload)
     436                 :            :     const Point& operator[](Xapian::doccount i) const;
     437                 :            : 
     438                 :            :     /// Return the documents that are contained within the cluster
     439                 :            :     DocumentSet get_documents() const;
     440                 :            : 
     441                 :            :     /// Return the current centroid of the cluster
     442                 :            :     const Centroid& get_centroid() const;
     443                 :            : 
     444                 :            :     /** Set the centroid of the Cluster to 'centroid'
     445                 :            :      *
     446                 :            :      *  @param centroid         Centroid object for the Cluster
     447                 :            :      */
     448                 :            :     void set_centroid(const Centroid &centroid);
     449                 :            : 
     450                 :            :     /** Recalculate the centroid of the Cluster after each iteration
     451                 :            :      *  of the KMeans algorithm by taking the mean of all document vectors (Points)
     452                 :            :      *  that belong to the Cluster
     453                 :            :      */
     454                 :            :     void recalculate();
     455                 :            : };
     456                 :            : 
     457                 :            : /** Class for storing the results returned by the Clusterer
     458                 :            :  */
     459                 :          0 : class XAPIAN_VISIBILITY_DEFAULT ClusterSet {
     460                 :            :   public:
     461                 :            :     class Internal;
     462                 :            :     /// @private @internal Reference counted internals.
     463                 :            :     Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
     464                 :            : 
     465                 :            :     /** Copying is allowed.  The internals are reference counted, so
     466                 :            :      *  copying is cheap.
     467                 :            :      *
     468                 :            :      *  @param other    The object to copy.
     469                 :            :      */
     470                 :            :     ClusterSet(const ClusterSet &other);
     471                 :            : 
     472                 :            :     /** Assignment is allowed.  The internals are reference counted,
     473                 :            :      *  so assignment is cheap.
     474                 :            :      *
     475                 :            :      *  @param other    The object to copy.
     476                 :            :      */
     477                 :            :     ClusterSet& operator=(const ClusterSet &other);
     478                 :            : 
     479                 :            :     /** Move constructor.
     480                 :            :      *
     481                 :            :      *  @param other    The object to move.
     482                 :            :      */
     483                 :            :     ClusterSet(ClusterSet && other);
     484                 :            : 
     485                 :            :     /** Move assignment operator.
     486                 :            :      *
     487                 :            :      *  @param other    The object to move.
     488                 :            :      */
     489                 :            :     ClusterSet & operator=(ClusterSet && other);
     490                 :            : 
     491                 :            :     /// Default constructor
     492                 :            :     ClusterSet();
     493                 :            : 
     494                 :            :     /// Destructor
     495                 :            :     ~ClusterSet();
     496                 :            : 
     497                 :            :     /** Add a cluster to the ClusterSet
     498                 :            :      *
     499                 :            :      *  @param cluster  Cluster object which is to be added to the ClusterSet
     500                 :            :      */
     501                 :            :     void add_cluster(const Cluster &cluster);
     502                 :            : 
     503                 :            :     /** Add the point to the cluster at position 'index'
     504                 :            :      *
     505                 :            :      *  @param point    Point object which needs to be added to
     506                 :            :      *                  a Cluster within the ClusterSet
     507                 :            :      *  @param index    Index of the Cluster within the ClusterSet to
     508                 :            :      *                  which the Point is to be added
     509                 :            :      */
     510                 :            :     void add_to_cluster(const Point &point, unsigned int index);
     511                 :            : 
     512                 :            :     /// Return the number of clusters in the ClusterSet
     513                 :            :     Xapian::doccount size() const;
     514                 :            : 
     515                 :            :     /// Return a modifiable reference to the cluster at index 'i'
     516                 :            :     Cluster& operator[](Xapian::doccount i);
     517                 :            : 
     518                 :            :     /// Return a read-only reference to the cluster at index 'i'
     519                 :            :     const Cluster& operator[](Xapian::doccount i) const;
     520                 :            : 
     521                 :            :     /// Clear all the clusters in the ClusterSet
     522                 :            :     void clear_clusters();
     523                 :            : 
     524                 :            :     /** Recalculate the centroid for all the clusters in the ClusterSet */
     525                 :            :     void recalculate_centroids();
     526                 :            : };
     527                 :            : 
     528                 :            : /** Abstract base class for calculating the similarity between documents
     529                 :            :  */
     530                 :            : class XAPIAN_VISIBILITY_DEFAULT Similarity {
     531                 :            :   public:
     532                 :            :     /// Virtual destructor (allows deletion via a base class pointer)
     533                 :            :     virtual ~Similarity();
     534                 :            : 
     535                 :            :     /** Calculate and return the similarity between two points
     536                 :            :      *
     537                 :            :      *  @param a        First point object for the similarity calculation
     538                 :            :      *  @param b        Second point object for the similarity calculation
     539                 :            :      */
     540                 :            :     virtual double similarity(const PointType &a, const PointType &b) const = 0;
     541                 :            : 
     542                 :            :     /// Return a string describing the similarity metric being used
     543                 :            :     virtual std::string get_description() const = 0;
     544                 :            : };
     545                 :            : 
     546                 :            : /** Class for calculating the cosine distance between two documents
     547                 :            :  */
     548         [ -  + ]:         24 : class XAPIAN_VISIBILITY_DEFAULT CosineDistance : public Similarity {
     549                 :            :   public:
     550                 :            :     /** Calculates and returns the cosine similarity using the
     551                 :            :      *  formula  cos(theta) = a.b/(|a|*|b|)
     552                 :            :      */
     553                 :            :     double similarity(const PointType &a, const PointType &b) const override;
     554                 :            : 
     555                 :            :     /// Return a string describing this object
     556                 :            :     std::string get_description() const override;
     557                 :            : };
     558                 :            : 
     559                 :            : /** Abstract base class which clusterer implementations derive from
     560                 :            :  */
     561                 :         18 : class XAPIAN_VISIBILITY_DEFAULT Clusterer
     562                 :            :     : public Xapian::Internal::opt_intrusive_base {
     563                 :            :   public:
     564                 :            :     /// Destructor
     565                 :            :     virtual ~Clusterer();
     566                 :            : 
     567                 :            :     /** Implement the required clustering algorithm in the subclass and
     568                 :            :      *  return the clustered output as a ClusterSet
     569                 :            :      *
     570                 :            :      *  @param mset     The MSet object which contains the documents to be
     571                 :            :      *                  clustered
     572                 :            :      */
     573                 :            :     virtual ClusterSet cluster(const MSet &mset) = 0;
     574                 :            : 
     575                 :            :     /// Returns a string describing the clusterer being used
     576                 :            :     virtual std::string get_description() const = 0;
     577                 :            : 
     578                 :            :     /** Start reference counting this object.
     579                 :            :      *
     580                 :            :      *  You can hand ownership of a dynamically allocated Clusterer
     581                 :            :      *  object to Xapian by calling release() and then passing the object to a
     582                 :            :      *  Xapian method.  Xapian will arrange to delete the object once it is no
     583                 :            :      *  longer required.
     584                 :            :      */
     585                 :            :     Clusterer * release() {
     586                 :            :         opt_intrusive_base::release();
     587                 :            :         return this;
     588                 :            :     }
     589                 :            : 
     590                 :            :     /** Start reference counting this object (overload for const objects).
     591                 :            :      *
     592                 :            :      *  You can hand ownership of a dynamically allocated Clusterer
     593                 :            :      *  object to Xapian by calling release() and then passing the object to a
     594                 :            :      *  Xapian method.  Xapian will arrange to delete the object once it is no
     595                 :            :      *  longer required.
     596                 :            :      */
     597                 :            :     const Clusterer * release() const {
     598                 :            :         opt_intrusive_base::release();
     599                 :            :         return this;
     600                 :            :     }
     601                 :            : };
     602                 :            : 
     603                 :            : /** Kmeans clusterer:
     604                 :            :  *  This clusterer implements the K-Means clustering algorithm
     605                 :            :  */
     606                 :            : class XAPIAN_VISIBILITY_DEFAULT KMeans : public Clusterer {
     607                 :            :     /// Contains the initialised points that are to be clustered
     608                 :            :     std::vector<Point> points;
     609                 :            : 
     610                 :            :     /// Specifies that the clusterer needs to form 'k' clusters
     611                 :            :     unsigned int k;
     612                 :            : 
     613                 :            :     /// Specifies the maximum number of iterations that KMeans will have
     614                 :            :     unsigned int max_iters;
     615                 :            : 
     616                 :            :     /// Pointer to stopper object for identifying stopwords
     617                 :            :     Xapian::Internal::opt_intrusive_ptr<const Xapian::Stopper> stopper;
     618                 :            : 
     619                 :            :     /** Initialise 'k' clusters by selecting 'k' centroids and assigning
     620                 :            :      *  them to different clusters
     621                 :            :      *
     622                 :            :      *  @param cset             ClusterSet object to be initialised by assigning
     623                 :            :      *                          centroids to each cluster
     624                 :            :      *  @param num_of_points    Number of points passed to clusterer
     625                 :            :      */
     626                 :            :     void initialise_clusters(ClusterSet &cset, Xapian::doccount num_of_points);
     627                 :            : 
     628                 :            :     /** Initialise the Points to be fed into the Clusterer with the MSet object
     629                 :            :      *  'source'. The TF-IDF weights for the documents are calculated and stored
     630                 :            :      *  within the Points to be used later during distance calculations
     631                 :            :      *
     632                 :            :      *  @param source   MSet object containing the documents which will be
     633                 :            :      *                  used to create document vectors that are represented
     634                 :            :      *                  as Point objects
     635                 :            :      */
     636                 :            :     void initialise_points(const MSet &source);
     637                 :            : 
     638                 :            :   public:
     639                 :            :     /** Constructor specifying number of clusters and maximum iterations
     640                 :            :      *
     641                 :            :      *  @param k_               Number of required clusters
     642                 :            :      *  @param max_iters_       The maximum number of iterations for which KMeans
     643                 :            :      *                          will run if it doesn't converge
     644                 :            :      */
     645                 :            :     explicit KMeans(unsigned int k_, unsigned int max_iters_ = 0);
     646                 :            : 
     647                 :            :     /** Implements the KMeans clustering algorithm
     648                 :            :      *
     649                 :            :      *  @param mset    MSet object containing the documents that are to
     650                 :            :      *                 be clustered
     651                 :            :      */
     652                 :            :     ClusterSet cluster(const MSet &mset) override;
     653                 :            : 
     654                 :            :     /** Set the Xapian::Stopper object to be used for identifying stopwords.
     655                 :            :      *
     656                 :            :      *  Stopwords are discarded while calculating term frequency for terms.
     657                 :            :      *
     658                 :            :      *  @param stop     The Stopper object to set (default nullptr, which
     659                 :            :      *                  means no stopwords)
     660                 :            :      */
     661                 :            :     void set_stopper(const Xapian::Stopper *stop = nullptr);
     662                 :            : 
     663                 :            :     /// Return a string describing this object
     664                 :            :     std::string get_description() const override;
     665                 :            : };
     666                 :            : 
     667                 :            : /** LCD clusterer:
     668                 :            :  *  This clusterer implements the LCD clustering algorithm adapted from
     669                 :            :  *  Modelling efficient novelty-based search result diversification in metric
     670                 :            :  *  spaces Gil-Costa et al. 2013
     671                 :            :  */
     672         [ -  + ]:         12 : class XAPIAN_VISIBILITY_DEFAULT LCDClusterer : public Clusterer {
     673                 :            :     /// Specifies that the clusterer needs to form 'k' clusters
     674                 :            :     unsigned int k;
     675                 :            : 
     676                 :            :   public:
     677                 :            :     /** Constructor specifying number of clusters
     678                 :            :      *
     679                 :            :      *  @param k_               Number of required clusters
     680                 :            :      */
     681                 :            :     explicit LCDClusterer(unsigned int k_);
     682                 :            : 
     683                 :            :     /** Implements the LCD clustering algorithm
     684                 :            :      *
     685                 :            :      *  @param mset    MSet object containing the documents that are to
     686                 :            :      *                 be clustered
     687                 :            :      */
     688                 :            :     ClusterSet cluster(const MSet &mset) override;
     689                 :            : 
     690                 :            :     /// Return a string describing this object
     691                 :            :     std::string get_description() const override;
     692                 :            : };
     693                 :            : }
     694                 :            : #endif // XAPIAN_INCLUDED_CLUSTER_H

Generated by: LCOV version 1.11