LCOV - code coverage report
Current view: top level - expand - expandweight.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 45 45 100.0 %
Date: 2019-05-23 11:15:29 Functions: 15 15 100.0 %
Branches: 15 24 62.5 %

           Branch data     Line data    Source code
       1                 :            : /** @file expandweight.h
       2                 :            :  * @brief Collate statistics and calculate the term weights for the ESet.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2007,2008,2009,2011,2016 Olly Betts
       5                 :            :  * Copyright (C) 2013 Aarsh Shah
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or
       8                 :            :  * modify it under the terms of the GNU General Public License as
       9                 :            :  * published by the Free Software Foundation; either version 2 of the
      10                 :            :  * License, or (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful,
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #ifndef XAPIAN_INCLUDED_EXPANDWEIGHT_H
      23                 :            : #define XAPIAN_INCLUDED_EXPANDWEIGHT_H
      24                 :            : 
      25                 :            : #include <xapian/database.h>
      26                 :            : 
      27                 :            : #include "api/termlist.h"
      28                 :            : #include "internaltypes.h"
      29                 :            : 
      30                 :            : #include <string>
      31                 :            : #include <vector>
      32                 :            : 
      33                 :            : namespace Xapian {
      34                 :            : namespace Internal {
      35                 :            : 
      36                 :            : /// Collates statistics while calculating term weight in an ESet.
      37                 :        428 : class ExpandStats {
      38                 :            :     /// Which databases in a multidb are included in termfreq.
      39                 :            :     std::vector<bool> dbs_seen;
      40                 :            : 
      41                 :            :     /// Average document length in the whole database.
      42                 :            :     Xapian::doclength avlen;
      43                 :            : 
      44                 :            :     /// The parameter k to be used for TradWeight query expansion.
      45                 :            :     double expand_k;
      46                 :            : 
      47                 :            :   public:
      48                 :            :     /// Size of the subset of a multidb to which the value in termfreq applies.
      49                 :            :     Xapian::doccount dbsize;
      50                 :            : 
      51                 :            :     /// Term frequency (for a multidb, may be for a subset of the databases).
      52                 :            :     Xapian::doccount termfreq;
      53                 :            : 
      54                 :            :     /// The number of times the term occurs in the rset.
      55                 :            :     Xapian::termcount rcollection_freq;
      56                 :            : 
      57                 :            :     /// The number of documents from the RSet indexed by the current term (r).
      58                 :            :     Xapian::doccount rtermfreq;
      59                 :            : 
      60                 :            :     /// The multiplier to be used in TradWeight query expansion.
      61                 :            :     double multiplier;
      62                 :            : 
      63                 :            :     /// Keeps track of the index of the sub-database we're accumulating for.
      64                 :            :     size_t db_index;
      65                 :            : 
      66                 :            :     /// Constructor for expansion schemes which do not require the "expand_k"
      67                 :            :     /// parameter.
      68                 :          7 :     explicit ExpandStats(Xapian::doclength avlen_)
      69                 :            :         : avlen(avlen_), expand_k(0), dbsize(0), termfreq(0),
      70                 :          7 :           rcollection_freq(0), rtermfreq(0), multiplier(0), db_index(0) {
      71                 :          7 :     }
      72                 :            : 
      73                 :            :     /// Constructor for expansion schemes which require the "expand_k" parameter.
      74                 :        207 :     ExpandStats(Xapian::doclength avlen_, double expand_k_)
      75                 :            :         : avlen(avlen_), expand_k(expand_k_), dbsize(0), termfreq(0),
      76                 :        207 :           rcollection_freq(0), rtermfreq(0), multiplier(0), db_index(0) {
      77                 :        207 :     }
      78                 :            : 
      79                 :       9560 :     void accumulate(Xapian::termcount wdf, Xapian::termcount doclen,
      80                 :            :                     Xapian::doccount subtf, Xapian::doccount subdbsize)
      81                 :            :     {
      82                 :            :         // Boolean terms may have wdf == 0, but treat that as 1 so such terms
      83                 :            :         // get a non-zero weight.
      84         [ -  + ]:       9560 :         if (wdf == 0) wdf = 1;
      85                 :       9560 :         ++rtermfreq;
      86                 :       9560 :         rcollection_freq += wdf;
      87                 :            : 
      88                 :       9560 :         multiplier += (expand_k + 1) * wdf / (expand_k * doclen / avlen + wdf);
      89                 :            : 
      90                 :            :         // If we've not seen this sub-database before, then update dbsize and
      91                 :            :         // termfreq and note that we have seen it.
      92 [ +  + ][ -  + ]:       9560 :         if (db_index >= dbs_seen.size() || !dbs_seen[db_index]) {
         [ +  + ][ +  + ]
      93         [ +  - ]:       8611 :             if (db_index >= dbs_seen.size()) dbs_seen.resize(db_index + 1);
      94                 :       8611 :             dbs_seen[db_index] = true;
      95                 :       8611 :             dbsize += subdbsize;
      96                 :       8611 :             termfreq += subtf;
      97                 :            :         }
      98                 :       9560 :     }
      99                 :            : 
     100                 :            :     /* Clear the statistics collected in the ExpandStats object before using it
     101                 :            :      * for a new term. */
     102                 :       8611 :     void clear_stats()
     103                 :            :     {
     104                 :       8611 :         dbs_seen.clear();
     105                 :       8611 :         dbsize = 0;
     106                 :       8611 :         termfreq = 0;
     107                 :       8611 :         rcollection_freq = 0;
     108                 :       8611 :         rtermfreq = 0;
     109                 :       8611 :         multiplier = 0;
     110                 :       8611 :         db_index = 0;
     111                 :       8611 :     }
     112                 :            : };
     113                 :            : 
     114                 :            : /// Class for calculating ESet term weights.
     115                 :        428 : class ExpandWeight {
     116                 :            :     /// The combined database.
     117                 :            :     const Xapian::Database db;
     118                 :            : 
     119                 :            :     /// The number of documents in the whole database.
     120                 :            :     Xapian::doccount dbsize;
     121                 :            : 
     122                 :            :     /// Average document length in the whole database.
     123                 :            :     Xapian::doclength avlen;
     124                 :            : 
     125                 :            :     /// The number of documents in the RSet.
     126                 :            :     Xapian::doccount rsize;
     127                 :            : 
     128                 :            :     /// The collection frequency of the term.
     129                 :            :     Xapian::termcount collection_freq;
     130                 :            : 
     131                 :            :     /// The total length of the database.
     132                 :            :     Xapian::totallength collection_len;
     133                 :            : 
     134                 :            :     /** Should we calculate the exact term frequency when generating an ESet?
     135                 :            :      *
     136                 :            :      *  This only has any effect if we're using a combined database.
     137                 :            :      *
     138                 :            :      *  If this member is true, the exact term frequency will be obtained from
     139                 :            :      *  the Database object.  If this member is false, then an approximation is
     140                 :            :      *  used to estimate the term frequency based on the term frequencies in
     141                 :            :      *  the sub-databases which we see while collating term statistics, and the
     142                 :            :      *  relative sizes of the sub-databases.
     143                 :            :      */
     144                 :            :     bool use_exact_termfreq;
     145                 :            : 
     146                 :            :   public:
     147                 :            :     /** Constructor.
     148                 :            :      *
     149                 :            :      *  @param db_ The database.
     150                 :            :      *  @param rsize_ The number of documents in the RSet.
     151                 :            :      *  @param use_exact_termfreq_ When expanding over a combined database,
     152                 :            :      *                             should we use the exact termfreq (if false
     153                 :            :      *                             a cheaper approximation is used).
     154                 :            :      */
     155                 :          7 :     ExpandWeight(const Xapian::Database &db_,
     156                 :            :                  Xapian::doccount rsize_,
     157                 :            :                  bool use_exact_termfreq_)
     158 [ +  - ][ +  - ]:          7 :         : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
     159                 :            :           rsize(rsize_), collection_freq(0),
     160                 :          7 :           collection_len(avlen * dbsize + .5),
     161         [ +  - ]:         14 :           use_exact_termfreq(use_exact_termfreq_), stats(avlen) {}
     162                 :            : 
     163                 :            :     /** Constructor.
     164                 :            :      *
     165                 :            :      *  @param db_ The database.
     166                 :            :      *  @param rsize_ The number of documents in the RSet.
     167                 :            :      *  @param use_exact_termfreq_ When expanding over a combined database,
     168                 :            :      *                             should we use the exact termfreq (if false
     169                 :            :      *                             a cheaper approximation is used).
     170                 :            :      *  @param expand_k_ The parameter for TradWeight query expansion.
     171                 :            :      */
     172                 :        207 :     ExpandWeight(const Xapian::Database &db_,
     173                 :            :                  Xapian::doccount rsize_,
     174                 :            :                  bool use_exact_termfreq_,
     175                 :            :                  double expand_k_)
     176 [ +  - ][ +  - ]:        207 :         : db(db_), dbsize(db.get_doccount()), avlen(db.get_avlength()),
     177                 :            :           rsize(rsize_), collection_freq(0),
     178                 :        207 :           collection_len(avlen * dbsize + .5),
     179         [ +  - ]:        414 :           use_exact_termfreq(use_exact_termfreq_), stats(avlen, expand_k_) {}
     180                 :            : 
     181                 :            :     /** Get the term statistics.
     182                 :            :      *  @param merger The tree of TermList objects.
     183                 :            :      *  @param term The current term name.
     184                 :            :      */
     185                 :            :     void collect_stats(TermList * merger, const std::string & term);
     186                 :            : 
     187                 :            :     /// Calculate the weight.
     188                 :            :     virtual double get_weight() const = 0;
     189                 :            : 
     190                 :            :   protected:
     191                 :            :     /// An ExpandStats object to accumulate statistics.
     192                 :            :     ExpandStats stats;
     193                 :            : 
     194                 :            :     /// Return the average length of the database.
     195                 :            :     double get_avlen() const { return avlen; }
     196                 :            : 
     197                 :            :     /// Return the number of documents in the RSet.
     198                 :      16284 :     Xapian::doccount get_rsize() const { return rsize; }
     199                 :            : 
     200                 :            :     /// Return the collection frequency of the term.
     201                 :        938 :     Xapian::termcount get_collection_freq() const { return collection_freq; }
     202                 :            : 
     203                 :            :     /// Return the length of the collection.
     204                 :            :     Xapian::totallength get_collection_len() const { return collection_len; }
     205                 :            : 
     206                 :            :     /// Return the size of the database.
     207                 :      17222 :     Xapian::doccount get_dbsize() const { return dbsize; }
     208                 :            : };
     209                 :            : 
     210                 :            : /** This class implements the TradWeight scheme for query expansion.
     211                 :            :  *
     212                 :            :  *  It is the default scheme for query expansion.
     213                 :            :  */
     214                 :        414 : class TradEWeight : public ExpandWeight {
     215                 :            :   public:
     216                 :            :     /** Constructor.
     217                 :            :      *
     218                 :            :      *  @param db_ The database.
     219                 :            :      *  @param rsize_ The number of documents in the RSet.
     220                 :            :      *  @param use_exact_termfreq_ When expanding over a combined database,
     221                 :            :      *                             should we use the exact termfreq (if false
     222                 :            :      *                             a cheaper approximation is used).
     223                 :            :      *  @param expand_k_ The parameter for TradWeight query expansion.
     224                 :            :      *
     225                 :            :      *  All the parameters are passed to the parent ExpandWeight object.
     226                 :            :      */
     227                 :        207 :     TradEWeight(const Xapian::Database &db_,
     228                 :            :                 Xapian::doccount rsize_,
     229                 :            :                 bool use_exact_termfreq_,
     230                 :            :                 double expand_k_)
     231                 :        207 :         : ExpandWeight(db_, rsize_, use_exact_termfreq_, expand_k_) { }
     232                 :            : 
     233                 :            :     double get_weight() const;
     234                 :            : };
     235                 :            : 
     236                 :            : /** This class implements the Bo1 scheme for query expansion.
     237                 :            :  *
     238                 :            :  *  Bo1 is a representative scheme of the Divergence from Randomness Framework
     239                 :            :  *  by Gianni Amati.
     240                 :            :  *
     241                 :            :  *  This is a parameter free weighting scheme for query expansion and it uses
     242                 :            :  *  the Bose-Einstein probabilistic distribution.
     243                 :            :  *
     244                 :            :  *  For more information about the DFR Framework and the Bo1 scheme, please
     245                 :            :  *  refer to Gianni Amati's PHD thesis.
     246                 :            :  */
     247                 :         14 : class Bo1EWeight : public ExpandWeight {
     248                 :            :   public:
     249                 :            :     /** Constructor.
     250                 :            :      *
     251                 :            :      *  @param db_ The database.
     252                 :            :      *  @param rsize_ The number of documents in the RSet.
     253                 :            :      *  @param use_exact_termfreq_ When expanding over a combined database,
     254                 :            :      *                             should we use the exact termfreq (if false
     255                 :            :      *                             a cheaper approximation is used).
     256                 :            :      *
     257                 :            :      *  All the parameters are passed to the parent ExpandWeight object.
     258                 :            :      */
     259                 :          7 :     Bo1EWeight(const Xapian::Database &db_,
     260                 :            :                Xapian::doccount rsize_,
     261                 :            :                bool use_exact_termfreq_)
     262                 :          7 :         : ExpandWeight(db_, rsize_, use_exact_termfreq_) {}
     263                 :            : 
     264                 :            :     double get_weight() const;
     265                 :            : };
     266                 :            : 
     267                 :            : }
     268                 :            : }
     269                 :            : 
     270                 :            : #endif // XAPIAN_INCLUDED_EXPANDWEIGHT_H

Generated by: LCOV version 1.11