LCOV - code coverage report
Current view: top level - include/xapian - weight.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 253 257 98.4 %
Date: 2019-06-30 05:20:33 Functions: 50 50 100.0 %
Branches: 67 98 68.4 %

           Branch data     Line data    Source code
       1                 :            : /** @file weight.h
       2                 :            :  * @brief Weighting scheme API.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016,2017 Olly Betts
       5                 :            :  * Copyright (C) 2009 Lemur Consulting Ltd
       6                 :            :  * Copyright (C) 2013,2014 Aarsh Shah
       7                 :            :  * Copyright (C) 2016,2017 Vivek Pal
       8                 :            :  *
       9                 :            :  * This program is free software; you can redistribute it and/or
      10                 :            :  * modify it under the terms of the GNU General Public License as
      11                 :            :  * published by the Free Software Foundation; either version 2 of the
      12                 :            :  * License, or (at your option) any later version.
      13                 :            :  *
      14                 :            :  * This program is distributed in the hope that it will be useful,
      15                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      16                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17                 :            :  * GNU General Public License for more details.
      18                 :            :  *
      19                 :            :  * You should have received a copy of the GNU General Public License
      20                 :            :  * along with this program; if not, write to the Free Software
      21                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      22                 :            :  */
      23                 :            : 
      24                 :            : #ifndef XAPIAN_INCLUDED_WEIGHT_H
      25                 :            : #define XAPIAN_INCLUDED_WEIGHT_H
      26                 :            : 
      27                 :            : #include <string>
      28                 :            : 
      29                 :            : #include <xapian/registry.h>
      30                 :            : #include <xapian/types.h>
      31                 :            : #include <xapian/visibility.h>
      32                 :            : 
      33                 :            : namespace Xapian {
      34                 :            : 
      35                 :            : /** Abstract base class for weighting schemes. */
      36                 :            : class XAPIAN_VISIBILITY_DEFAULT Weight {
      37                 :            :   protected:
      38                 :            :     /// Stats which the weighting scheme can use (see @a need_stat()).
      39                 :            :     typedef enum {
      40                 :            :         /// Number of documents in the collection.
      41                 :            :         COLLECTION_SIZE = 1,
      42                 :            :         /// Number of documents in the RSet.
      43                 :            :         RSET_SIZE = 2,
      44                 :            :         /// Average length of documents in the collection.
      45                 :            :         AVERAGE_LENGTH = 4,
      46                 :            :         /// How many documents the current term is in.
      47                 :            :         TERMFREQ = 8,
      48                 :            :         /// How many documents in the RSet the current term is in.
      49                 :            :         RELTERMFREQ = 16,
      50                 :            :         /// Sum of wqf for terms in the query.
      51                 :            :         QUERY_LENGTH = 32,
      52                 :            :         /// Within-query-frequency of the current term.
      53                 :            :         WQF = 64,
      54                 :            :         /// Within-document-frequency of the current term in the current document.
      55                 :            :         WDF = 128,
      56                 :            :         /// Length of the current document (sum wdf).
      57                 :            :         DOC_LENGTH = 256,
      58                 :            :         /// Lower bound on (non-zero) document lengths.
      59                 :            :         DOC_LENGTH_MIN = 512,
      60                 :            :         /// Upper bound on document lengths.
      61                 :            :         DOC_LENGTH_MAX = 1024,
      62                 :            :         /// Upper bound on wdf.
      63                 :            :         WDF_MAX = 2048,
      64                 :            :         /// Sum of wdf over the whole collection for the current term.
      65                 :            :         COLLECTION_FREQ = 4096,
      66                 :            :         /// Number of unique terms in the current document.
      67                 :            :         UNIQUE_TERMS = 8192
      68                 :            :     } stat_flags;
      69                 :            : 
      70                 :            :     /** Tell Xapian that your subclass will want a particular statistic.
      71                 :            :      *
      72                 :            :      *  Some of the statistics can be costly to fetch or calculate, so
      73                 :            :      *  Xapian needs to know which are actually going to be used.  You
      74                 :            :      *  should call need_stat() from your constructor for each statistic
      75                 :            :      *  needed by the weighting scheme you are implementing (possibly
      76                 :            :      *  conditional on the values of parameters of the weighting scheme).
      77                 :            :      *
      78                 :            :      *  Prior to 1.5.0, it was assumed that if get_maxextra() returned
      79                 :            :      *  a non-zero value then get_sumextra() needed the document length even if
      80                 :            :      *  need_stat(DOC_LENGTH) wasn't called - the logic was that get_sumextra() could
      81                 :            :      *  only return a constant value if it didn't use the document length.
      82                 :            :      *  However, this is no longer valid since it can also use the number of
      83                 :            :      *  unique terms in the document, so now you need to specify explicitly.
      84                 :            :      *
      85                 :            :      * @param flag  The stat_flags value for a required statistic.
      86                 :            :      */
      87                 :    5344428 :     void need_stat(stat_flags flag) {
      88                 :    5344428 :         stats_needed = stat_flags(stats_needed | flag);
      89                 :    5344428 :     }
      90                 :            : 
      91                 :            :     /** Allow the subclass to perform any initialisation it needs to.
      92                 :            :      *
      93                 :            :      *  @param factor     Any scaling factor (e.g. from OP_SCALE_WEIGHT).
      94                 :            :      *                    If the Weight object is for the term-independent
      95                 :            :      *                    weight supplied by get_sumextra()/get_maxextra(),
      96                 :            :      *                    then init(0.0) is called (starting from Xapian
      97                 :            :      *                    1.2.11 and 1.3.1 - earlier versions failed to
      98                 :            :      *                    call init() for such Weight objects).
      99                 :            :      */
     100                 :            :     virtual void init(double factor) = 0;
     101                 :            : 
     102                 :            :   private:
     103                 :            :     /// Don't allow assignment.
     104                 :            :     void operator=(const Weight &);
     105                 :            : 
     106                 :            :     /// A bitmask of the statistics this weighting scheme needs.
     107                 :            :     stat_flags stats_needed;
     108                 :            : 
     109                 :            :     /// The number of documents in the collection.
     110                 :            :     Xapian::doccount collection_size_;
     111                 :            : 
     112                 :            :     /// The number of documents marked as relevant.
     113                 :            :     Xapian::doccount rset_size_;
     114                 :            : 
     115                 :            :     /// The average length of a document in the collection.
     116                 :            :     Xapian::doclength average_length_;
     117                 :            : 
     118                 :            :     /// The number of documents which this term indexes.
     119                 :            :     Xapian::doccount termfreq_;
     120                 :            : 
     121                 :            :     /// The collection frequency of the term.
     122                 :            :     Xapian::termcount collectionfreq_;
     123                 :            : 
     124                 :            :     /// The number of relevant documents which this term indexes.
     125                 :            :     Xapian::doccount reltermfreq_;
     126                 :            : 
     127                 :            :     /// The length of the query.
     128                 :            :     Xapian::termcount query_length_;
     129                 :            : 
     130                 :            :     /// The within-query-frequency of this term.
     131                 :            :     Xapian::termcount wqf_;
     132                 :            : 
     133                 :            :     /// A lower bound on the minimum length of any document in the database.
     134                 :            :     Xapian::termcount doclength_lower_bound_;
     135                 :            : 
     136                 :            :     /// An upper bound on the maximum length of any document in the database.
     137                 :            :     Xapian::termcount doclength_upper_bound_;
     138                 :            : 
     139                 :            :     /// An upper bound on the wdf of this term.
     140                 :            :     Xapian::termcount wdf_upper_bound_;
     141                 :            : 
     142                 :            :   public:
     143                 :            : 
     144                 :            :     /// Default constructor, needed by subclass constructors.
     145                 :     540838 :     Weight() : stats_needed() { }
     146                 :            : 
     147                 :            :     /** Type of smoothing to use with the Language Model Weighting scheme.
     148                 :            :      *
     149                 :            :      *  Default is TWO_STAGE_SMOOTHING.
     150                 :            :      */
     151                 :            :     typedef enum {
     152                 :            :         TWO_STAGE_SMOOTHING = 1,
     153                 :            :         DIRICHLET_SMOOTHING = 2,
     154                 :            :         ABSOLUTE_DISCOUNT_SMOOTHING = 3,
     155                 :            :         JELINEK_MERCER_SMOOTHING = 4,
     156                 :            :         DIRICHLET_PLUS_SMOOTHING = 5
     157                 :            :     } type_smoothing;
     158                 :            : 
     159                 :            :     class Internal;
     160                 :            : 
     161                 :            :     /** Virtual destructor, because we have virtual methods. */
     162                 :            :     virtual ~Weight();
     163                 :            : 
     164                 :            :     /** Clone this object.
     165                 :            :      *
     166                 :            :      *  This method allocates and returns a copy of the object it is called on.
     167                 :            :      *
     168                 :            :      *  If your subclass is called FooWeight and has parameters a and b, then
     169                 :            :      *  you would implement FooWeight::clone() like so:
     170                 :            :      *
     171                 :            :      *  FooWeight * FooWeight::clone() const { return new FooWeight(a, b); }
     172                 :            :      *
     173                 :            :      *  Note that the returned object will be deallocated by Xapian after use
     174                 :            :      *  with "delete".  If you want to handle the deletion in a special way
     175                 :            :      *  (for example when wrapping the Xapian API for use from another
     176                 :            :      *  language) then you can define a static <code>operator delete</code>
     177                 :            :      *  method in your subclass as shown here:
     178                 :            :      *  https://trac.xapian.org/ticket/554#comment:1
     179                 :            :      */
     180                 :            :     virtual Weight * clone() const = 0;
     181                 :            : 
     182                 :            :     /** Return the name of this weighting scheme.
     183                 :            :      *
     184                 :            :      *  This name is used by the remote backend.  It is passed along with the
     185                 :            :      *  serialised parameters to the remote server so that it knows which class
     186                 :            :      *  to create.
     187                 :            :      *
     188                 :            :      *  Return the full namespace-qualified name of your class here - if
     189                 :            :      *  your class is called FooWeight, return "FooWeight" from this method
     190                 :            :      *  (Xapian::BM25Weight returns "Xapian::BM25Weight" here).
     191                 :            :      *
     192                 :            :      *  If you don't want to support the remote backend, you can use the
     193                 :            :      *  default implementation which simply returns an empty string.
     194                 :            :      */
     195                 :            :     virtual std::string name() const;
     196                 :            : 
     197                 :            :     /** Return this object's parameters serialised as a single string.
     198                 :            :      *
     199                 :            :      *  If you don't want to support the remote backend, you can use the
     200                 :            :      *  default implementation which simply throws Xapian::UnimplementedError.
     201                 :            :      */
     202                 :            :     virtual std::string serialise() const;
     203                 :            : 
     204                 :            :     /** Unserialise parameters.
     205                 :            :      *
     206                 :            :      *  This method unserialises parameters serialised by the @a serialise()
     207                 :            :      *  method and allocates and returns a new object initialised with them.
     208                 :            :      *
     209                 :            :      *  If you don't want to support the remote backend, you can use the
     210                 :            :      *  default implementation which simply throws Xapian::UnimplementedError.
     211                 :            :      *
     212                 :            :      *  Note that the returned object will be deallocated by Xapian after use
     213                 :            :      *  with "delete".  If you want to handle the deletion in a special way
     214                 :            :      *  (for example when wrapping the Xapian API for use from another
     215                 :            :      *  language) then you can define a static <code>operator delete</code>
     216                 :            :      *  method in your subclass as shown here:
     217                 :            :      *  https://trac.xapian.org/ticket/554#comment:1
     218                 :            :      *
     219                 :            :      *  @param serialised       A string containing the serialised parameters.
     220                 :            :      */
     221                 :            :     virtual Weight * unserialise(const std::string & serialised) const;
     222                 :            : 
     223                 :            :     /** Calculate the weight contribution for this object's term to a document.
     224                 :            :      *
     225                 :            :      *  The parameters give information about the document which may be used
     226                 :            :      *  in the calculations:
     227                 :            :      *
     228                 :            :      *  @param wdf    The within document frequency of the term in the document.
     229                 :            :      *  @param doclen The document's length (unnormalised).
     230                 :            :      *  @param uniqterms        Number of unique terms in the document (used
     231                 :            :      *                          for absolute smoothing).
     232                 :            :      */
     233                 :            :     virtual double get_sumpart(Xapian::termcount wdf,
     234                 :            :                                Xapian::termcount doclen,
     235                 :            :                                Xapian::termcount uniqterms) const = 0;
     236                 :            : 
     237                 :            :     /** Return an upper bound on what get_sumpart() can return for any document.
     238                 :            :      *
     239                 :            :      *  This information is used by the matcher to perform various
     240                 :            :      *  optimisations, so strive to make the bound as tight as possible.
     241                 :            :      */
     242                 :            :     virtual double get_maxpart() const = 0;
     243                 :            : 
     244                 :            :     /** Calculate the term-independent weight component for a document.
     245                 :            :      *
     246                 :            :      *  The parameter gives information about the document which may be used
     247                 :            :      *  in the calculations:
     248                 :            :      *
     249                 :            :      *  @param doclen The document's length (unnormalised).
     250                 :            :      *  @param uniqterms The number of unique terms in the document.
     251                 :            :      */
     252                 :            :     virtual double get_sumextra(Xapian::termcount doclen,
     253                 :            :                                 Xapian::termcount uniqterms) const = 0;
     254                 :            : 
     255                 :            :     /** Return an upper bound on what get_sumextra() can return for any
     256                 :            :      *  document.
     257                 :            :      *
     258                 :            :      *  This information is used by the matcher to perform various
     259                 :            :      *  optimisations, so strive to make the bound as tight as possible.
     260                 :            :      */
     261                 :            :     virtual double get_maxextra() const = 0;
     262                 :            : 
     263                 :            :     /** @private @internal Initialise this object to calculate weights for term
     264                 :            :      *  @a term.
     265                 :            :      *
     266                 :            :      *  @param stats      Source of statistics.
     267                 :            :      *  @param query_len_ Query length.
     268                 :            :      *  @param term       The term for the new object.
     269                 :            :      *  @param wqf_       The within-query-frequency of @a term.
     270                 :            :      *  @param factor     Any scaling factor (e.g. from OP_SCALE_WEIGHT).
     271                 :            :      */
     272                 :            :     XAPIAN_VISIBILITY_INTERNAL
     273                 :            :     void init_(const Internal & stats, Xapian::termcount query_len_,
     274                 :            :                const std::string & term, Xapian::termcount wqf_,
     275                 :            :                double factor);
     276                 :            : 
     277                 :            :     /** @private @internal Initialise this object to calculate weights for a
     278                 :            :      *  synonym.
     279                 :            :      *
     280                 :            :      *  @param stats       Source of statistics.
     281                 :            :      *  @param query_len_  Query length.
     282                 :            :      *  @param factor      Any scaling factor (e.g. from OP_SCALE_WEIGHT).
     283                 :            :      *  @param termfreq    The termfreq to use.
     284                 :            :      *  @param reltermfreq The reltermfreq to use.
     285                 :            :      *  @param collection_freq The collection frequency to use.
     286                 :            :      */
     287                 :            :     XAPIAN_VISIBILITY_INTERNAL
     288                 :            :     void init_(const Internal & stats, Xapian::termcount query_len_,
     289                 :            :                double factor, Xapian::doccount termfreq,
     290                 :            :                Xapian::doccount reltermfreq, Xapian::termcount collection_freq);
     291                 :            : 
     292                 :            :     /** @private @internal Initialise this object to calculate the extra weight
     293                 :            :      *  component.
     294                 :            :      *
     295                 :            :      *  @param stats      Source of statistics.
     296                 :            :      *  @param query_len_ Query length.
     297                 :            :      */
     298                 :            :     XAPIAN_VISIBILITY_INTERNAL
     299                 :            :     void init_(const Internal & stats, Xapian::termcount query_len_);
     300                 :            : 
     301                 :            :     /** @private @internal Return true if the document length is needed.
     302                 :            :      *
     303                 :            :      *  If this method returns true, then the document length will be fetched
     304                 :            :      *  and passed to @a get_sumpart().  Otherwise 0 may be passed for the
     305                 :            :      *  document length.
     306                 :            :      */
     307                 :     150102 :     bool get_sumpart_needs_doclength_() const {
     308                 :     150102 :         return stats_needed & DOC_LENGTH;
     309                 :            :     }
     310                 :            : 
     311                 :            :     /** @private @internal Return true if the WDF is needed.
     312                 :            :      *
     313                 :            :      *  If this method returns true, then the WDF will be fetched and passed to
     314                 :            :      *  @a get_sumpart().  Otherwise 0 may be passed for the wdf.
     315                 :            :      */
     316                 :     313059 :     bool get_sumpart_needs_wdf_() const {
     317                 :     313059 :         return stats_needed & WDF;
     318                 :            :     }
     319                 :            : 
     320                 :            :     /** @private @internal Return true if the number of unique terms is needed.
     321                 :            :      *
     322                 :            :      *  If this method returns true, then the number of unique terms will be
     323                 :            :      *  fetched and passed to @a get_sumpart().  Otherwise 0 may be passed for
     324                 :            :      *  the number of unique terms.
     325                 :            :      */
     326                 :     150102 :     bool get_sumpart_needs_uniqueterms_() const {
     327                 :     150102 :         return stats_needed & UNIQUE_TERMS;
     328                 :            :     }
     329                 :            : 
     330                 :            :     /** Return the appropriate weighting scheme object.
     331                 :            :      *
     332                 :            :      *  @param scheme   A string containing a weighting scheme name, which
     333                 :            :      *                  may be followed by the parameters required by that
     334                 :            :      *                  weighting scheme, e.g. "bm25 1.0 0.8".
     335                 :            :      *  @param reg      Xapian::Registry object to allow users to add their own
     336                 :            :      *                  custom weighting schemes (default: standard registry).
     337                 :            :      */
     338                 :            :     static const Weight * create(const std::string & scheme,
     339                 :            :                                  const Registry & reg = Registry());
     340                 :            : 
     341                 :            :     /** Return the parameterised weighting scheme object.
     342                 :            :      *
     343                 :            :      * @param params    Pointer to a string containing the parameter values
     344                 :            :      *                  for the weighting scheme.
     345                 :            :      */
     346                 :            :     virtual Weight * create_from_parameters(const char * params) const;
     347                 :            : 
     348                 :            :     /** Return the short name of the weighting scheme. E.g. "bm25". */
     349                 :            :     virtual std::string short_name() const;
     350                 :            : 
     351                 :            :   protected:
     352                 :            :     /** Don't allow copying.
     353                 :            :      *
     354                 :            :      *  This would ideally be private, but that causes a compilation error
     355                 :            :      *  with GCC 4.1 (which appears to be a bug).
     356                 :            :      */
     357                 :            :     XAPIAN_VISIBILITY_INTERNAL
     358                 :            :     Weight(const Weight &);
     359                 :            : 
     360                 :            :     /// The number of documents in the collection.
     361                 :     976896 :     Xapian::doccount get_collection_size() const { return collection_size_; }
     362                 :            : 
     363                 :            :     /// The number of documents marked as relevant.
     364                 :     976120 :     Xapian::doccount get_rset_size() const { return rset_size_; }
     365                 :            : 
     366                 :            :     /// The average length of a document in the collection.
     367                 :     978000 :     Xapian::doclength get_average_length() const { return average_length_; }
     368                 :            : 
     369                 :            :     /// The number of documents which this term indexes.
     370                 :     978836 :     Xapian::doccount get_termfreq() const { return termfreq_; }
     371                 :            : 
     372                 :            :     /// The number of relevant documents which this term indexes.
     373                 :       5562 :     Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
     374                 :            : 
     375                 :            :     /// The collection frequency of the term.
     376                 :       8706 :     Xapian::termcount get_collection_freq() const { return collectionfreq_; }
     377                 :            : 
     378                 :            :     /// The length of the query.
     379                 :       5686 :     Xapian::termcount get_query_length() const { return query_length_; }
     380                 :            : 
     381                 :            :     /// The within-query-frequency of this term.
     382                 :     976304 :     Xapian::termcount get_wqf() const { return wqf_; }
     383                 :            : 
     384                 :            :     /** An upper bound on the maximum length of any document in the database.
     385                 :            :      *
     386                 :            :      *  This should only be used by get_maxpart() and get_maxextra().
     387                 :            :      */
     388                 :       2128 :     Xapian::termcount get_doclength_upper_bound() const {
     389                 :       2128 :         return doclength_upper_bound_;
     390                 :            :     }
     391                 :            : 
     392                 :            :     /** A lower bound on the minimum length of any document in the database.
     393                 :            :      *
     394                 :            :      *  This bound does not include any zero-length documents.
     395                 :            :      *
     396                 :            :      *  This should only be used by get_maxpart() and get_maxextra().
     397                 :            :      */
     398                 :     823172 :     Xapian::termcount get_doclength_lower_bound() const {
     399                 :     823172 :         return doclength_lower_bound_;
     400                 :            :     }
     401                 :            : 
     402                 :            :     /** An upper bound on the wdf of this term.
     403                 :            :      *
     404                 :            :      *  This should only be used by get_maxpart() and get_maxextra().
     405                 :            :      */
     406                 :    1644441 :     Xapian::termcount get_wdf_upper_bound() const {
     407                 :    1644441 :         return wdf_upper_bound_;
     408                 :            :     }
     409                 :            : };
     410                 :            : 
     411                 :            : /** Class implementing a "boolean" weighting scheme.
     412                 :            :  *
     413                 :            :  *  This weighting scheme gives all documents zero weight.
     414                 :            :  */
     415         [ -  + ]:      13036 : class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
     416                 :            :     BoolWeight * clone() const;
     417                 :            : 
     418                 :            :     void init(double factor);
     419                 :            : 
     420                 :            :   public:
     421                 :            :     /** Construct a BoolWeight. */
     422                 :       6660 :     BoolWeight() { }
     423                 :            : 
     424                 :            :     std::string name() const;
     425                 :            :     std::string short_name() const;
     426                 :            : 
     427                 :            :     std::string serialise() const;
     428                 :            :     BoolWeight * unserialise(const std::string & serialised) const;
     429                 :            : 
     430                 :            :     double get_sumpart(Xapian::termcount wdf,
     431                 :            :                        Xapian::termcount doclen,
     432                 :            :                        Xapian::termcount uniqterms) const;
     433                 :            :     double get_maxpart() const;
     434                 :            : 
     435                 :            :     double get_sumextra(Xapian::termcount doclen,
     436                 :            :                         Xapian::termcount uniqterms) const;
     437                 :            :     double get_maxextra() const;
     438                 :            : 
     439                 :            :     BoolWeight * create_from_parameters(const char * params) const;
     440                 :            : };
     441                 :            : 
     442                 :            : /// Xapian::Weight subclass implementing the tf-idf weighting scheme.
     443         [ -  + ]:       8082 : class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
     444                 :            :     /* Three character string indicating the normalizations for tf(wdf), idf and
     445                 :            :        tfidf weight. */
     446                 :            :     std::string normalizations;
     447                 :            : 
     448                 :            :     /// The factor to multiply with the weight.
     449                 :            :     double wqf_factor;
     450                 :            : 
     451                 :            :     /// Normalised IDF value (document-independent).
     452                 :            :     double idfn;
     453                 :            : 
     454                 :            :     /// Parameters slope and delta in the Piv+ normalization weighting formula.
     455                 :            :     double param_slope, param_delta;
     456                 :            : 
     457                 :            :     TfIdfWeight * clone() const;
     458                 :            : 
     459                 :            :     void init(double factor);
     460                 :            : 
     461                 :            :     /* When additional normalizations are implemented in the future, the additional statistics for them
     462                 :            :        should be accessed by these functions. */
     463                 :            :     double get_wdfn(Xapian::termcount wdf,
     464                 :            :                     Xapian::termcount len,
     465                 :            :                     Xapian::termcount uniqterms, char c) const;
     466                 :            :     double get_idfn(char c) const;
     467                 :            :     double get_wtn(double wt, char c) const;
     468                 :            : 
     469                 :            :   public:
     470                 :            :     /** Construct a TfIdfWeight
     471                 :            :      *
     472                 :            :      *  @param normalizations   A three character string indicating the
     473                 :            :      *                          normalizations to be used for the tf(wdf), idf
     474                 :            :      *                          and document weight.  (default: "ntn")
     475                 :            :      *
     476                 :            :      * The @a normalizations string works like so:
     477                 :            :      *
     478                 :            :      * @li The first character specifies the normalization for the wdf.  The
     479                 :            :      *     following normalizations are currently supported:
     480                 :            :      *
     481                 :            :      *     @li 'n': None.      wdfn=wdf
     482                 :            :      *     @li 'b': Boolean    wdfn=1 if term in document else wdfn=0
     483                 :            :      *     @li 's': Square     wdfn=wdf*wdf
     484                 :            :      *     @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
     485                 :            :      *     @li 'P': Pivoted     wdfn=(1+log(1+log(wdf)))*(1/(1-slope+(slope*doclen/avg_len)))+delta
     486                 :            :      *     @li 'L': Log average wdfn=(1+log(wdf))/(1+log(doclen/unique_terms))
     487                 :            :      *
     488                 :            :      *     The Max-wdf and Augmented Max wdf normalizations haven't yet been
     489                 :            :      *     implemented.
     490                 :            :      *
     491                 :            :      * @li The second character indicates the normalization for the idf.  The
     492                 :            :      *     following normalizations are currently supported:
     493                 :            :      *
     494                 :            :      *     @li 'n': None    idfn=1
     495                 :            :      *     @li 't': TfIdf   idfn=log(N/Termfreq) where N is the number of
     496                 :            :      *         documents in collection and Termfreq is the number of documents
     497                 :            :      *         which are indexed by the term t.
     498                 :            :      *     @li 'p': Prob    idfn=log((N-Termfreq)/Termfreq)
     499                 :            :      *     @li 'f': Freq    idfn=1/Termfreq
     500                 :            :      *     @li 's': Squared idfn=log(N/Termfreq)^2
     501                 :            :      *     @li 'P': Pivoted idfn=log((N+1)/Termfreq)
     502                 :            :      *
     503                 :            :      * @li The third and final character indicates the normalization for
     504                 :            :      *     the document weight.  The following normalizations are currently
     505                 :            :      *     supported:
     506                 :            :      *
     507                 :            :      *     @li 'n': None wtn=tfn*idfn
     508                 :            :      *
     509                 :            :      * Implementing support for more normalizations of each type would require
     510                 :            :      * extending the backend to track more statistics.
     511                 :            :      */
     512                 :            :     explicit TfIdfWeight(const std::string &normalizations);
     513                 :            : 
     514                 :            :     /** Construct a TfIdfWeight
     515                 :            :      *
     516                 :            :      *  @param normalizations   A three character string indicating the
     517                 :            :      *                          normalizations to be used for the tf(wdf), idf
     518                 :            :      *                          and document weight.  (default: "ntn")
     519                 :            :      *  @param slope            Extra parameter for "Pivoted" tf normalization.  (default: 0.2)
     520                 :            :      *  @param delta            Extra parameter for "Pivoted" tf normalization.  (default: 1.0)
     521                 :            :      *
     522                 :            :      * The @a normalizations string works like so:
     523                 :            :      *
     524                 :            :      * @li The first character specifies the normalization for the wdf.  The
     525                 :            :      *     following normalizations are currently supported:
     526                 :            :      *
     527                 :            :      *     @li 'n': None.      wdfn=wdf
     528                 :            :      *     @li 'b': Boolean    wdfn=1 if term in document else wdfn=0
     529                 :            :      *     @li 's': Square     wdfn=wdf*wdf
     530                 :            :      *     @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
     531                 :            :      *     @li 'P': Pivoted     wdfn=(1+log(1+log(wdf)))*(1/(1-slope+(slope*doclen/avg_len)))+delta
     532                 :            :      *
     533                 :            :      *     The Max-wdf and Augmented Max wdf normalizations haven't yet been
     534                 :            :      *     implemented.
     535                 :            :      *
     536                 :            :      * @li The second character indicates the normalization for the idf.  The
     537                 :            :      *     following normalizations are currently supported:
     538                 :            :      *
     539                 :            :      *     @li 'n': None    idfn=1
     540                 :            :      *     @li 't': TfIdf   idfn=log(N/Termfreq) where N is the number of
     541                 :            :      *         documents in collection and Termfreq is the number of documents
     542                 :            :      *         which are indexed by the term t.
     543                 :            :      *     @li 'p': Prob    idfn=log((N-Termfreq)/Termfreq)
     544                 :            :      *     @li 'f': Freq    idfn=1/Termfreq
     545                 :            :      *     @li 's': Squared idfn=log(N/Termfreq)^2
     546                 :            :      *     @li 'P': Pivoted idfn=log((N+1)/Termfreq)
     547                 :            :      *
     548                 :            :      * @li The third and final character indicates the normalization for
     549                 :            :      *     the document weight.  The following normalizations are currently
     550                 :            :      *     supported:
     551                 :            :      *
     552                 :            :      *     @li 'n': None wtn=tfn*idfn
     553                 :            :      *
     554                 :            :      * Implementing support for more normalizations of each type would require
     555                 :            :      * extending the backend to track more statistics.
     556                 :            :      */
     557                 :            :     TfIdfWeight(const std::string &normalizations, double slope, double delta);
     558                 :            : 
     559                 :            :     /** Construct a TfIdfWeight using the default normalizations ("ntn"). */
     560                 :       1564 :     TfIdfWeight()
     561         [ +  - ]:       1564 :         : normalizations("ntn"), param_slope(0.2), param_delta(1.0)
     562                 :            :     {
     563                 :       1564 :         need_stat(TERMFREQ);
     564                 :       1564 :         need_stat(WDF);
     565                 :       1564 :         need_stat(WDF_MAX);
     566                 :       1564 :         need_stat(COLLECTION_SIZE);
     567                 :       1564 :     }
     568                 :            : 
     569                 :            :     std::string name() const;
     570                 :            : 
     571                 :            :     std::string short_name() const;
     572                 :            : 
     573                 :            :     std::string serialise() const;
     574                 :            :     TfIdfWeight * unserialise(const std::string & serialised) const;
     575                 :            : 
     576                 :            :     double get_sumpart(Xapian::termcount wdf,
     577                 :            :                        Xapian::termcount doclen,
     578                 :            :                        Xapian::termcount uniqterm) const;
     579                 :            :     double get_maxpart() const;
     580                 :            : 
     581                 :            :     double get_sumextra(Xapian::termcount doclen,
     582                 :            :                         Xapian::termcount uniqterms) const;
     583                 :            :     double get_maxextra() const;
     584                 :            : 
     585                 :            :     TfIdfWeight * create_from_parameters(const char * params) const;
     586                 :            : };
     587                 :            : 
     588                 :            : 
     589                 :            : /// Xapian::Weight subclass implementing the BM25 probabilistic formula.
     590         [ -  + ]:    2032838 : class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
     591                 :            :     /// Factor to multiply the document length by.
     592                 :            :     mutable Xapian::doclength len_factor;
     593                 :            : 
     594                 :            :     /// Factor combining all the document independent factors.
     595                 :            :     mutable double termweight;
     596                 :            : 
     597                 :            :     /// The BM25 parameters.
     598                 :            :     double param_k1, param_k2, param_k3, param_b;
     599                 :            : 
     600                 :            :     /// The minimum normalised document length value.
     601                 :            :     Xapian::doclength param_min_normlen;
     602                 :            : 
     603                 :            :     BM25Weight * clone() const;
     604                 :            : 
     605                 :            :     void init(double factor);
     606                 :            : 
     607                 :            :   public:
     608                 :            :     /** Construct a BM25Weight.
     609                 :            :      *
     610                 :            :      *  @param k1  A non-negative parameter controlling how influential
     611                 :            :      *             within-document-frequency (wdf) is.  k1=0 means that
     612                 :            :      *             wdf doesn't affect the weights.  The larger k1 is, the more
     613                 :            :      *             wdf influences the weights.  (default 1)
     614                 :            :      *
     615                 :            :      *  @param k2  A non-negative parameter which controls the strength of a
     616                 :            :      *             correction factor which depends upon query length and
     617                 :            :      *             normalised document length.  k2=0 disables this factor; larger
     618                 :            :      *             k2 makes it stronger.  (default 0)
     619                 :            :      *
     620                 :            :      *  @param k3  A non-negative parameter controlling how influential
     621                 :            :      *             within-query-frequency (wqf) is.  k3=0 means that wqf
     622                 :            :      *             doesn't affect the weights.  The larger k3 is, the more
     623                 :            :      *             wqf influences the weights.  (default 1)
     624                 :            :      *
     625                 :            :      *  @param b   A parameter between 0 and 1, controlling how strong the
     626                 :            :      *             document length normalisation of wdf is.  0 means no
     627                 :            :      *             normalisation; 1 means full normalisation.  (default 0.5)
     628                 :            :      *
     629                 :            :      *  @param min_normlen  A parameter specifying a minimum value for
     630                 :            :      *             normalised document length.  Normalised document length
     631                 :            :      *             values less than this will be clamped to this value, helping
     632                 :            :      *             to prevent very short documents getting large weights.
     633                 :            :      *             (default 0.5)
     634                 :            :      */
     635                 :     495720 :     BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
     636                 :            :         : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
     637                 :     495720 :           param_min_normlen(min_normlen)
     638                 :            :     {
     639         [ -  + ]:     495720 :         if (param_k1 < 0) param_k1 = 0;
     640         [ -  + ]:     495720 :         if (param_k2 < 0) param_k2 = 0;
     641         [ -  + ]:     495720 :         if (param_k3 < 0) param_k3 = 0;
     642         [ -  + ]:     495720 :         if (param_b < 0) {
     643                 :          0 :             param_b = 0;
     644         [ -  + ]:     495720 :         } else if (param_b > 1) {
     645                 :          0 :             param_b = 1;
     646                 :            :         }
     647                 :     495720 :         need_stat(COLLECTION_SIZE);
     648                 :     495720 :         need_stat(RSET_SIZE);
     649                 :     495720 :         need_stat(TERMFREQ);
     650                 :     495720 :         need_stat(RELTERMFREQ);
     651                 :     495720 :         need_stat(WDF);
     652                 :     495720 :         need_stat(WDF_MAX);
     653 [ +  + ][ +  + ]:     495720 :         if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
                 [ +  + ]
     654                 :     495624 :             need_stat(DOC_LENGTH_MIN);
     655                 :     495624 :             need_stat(AVERAGE_LENGTH);
     656                 :            :         }
     657 [ +  + ][ +  + ]:     495720 :         if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
     658         [ +  + ]:     495720 :         if (param_k2 != 0) {
     659                 :         66 :             need_stat(DOC_LENGTH);
     660                 :         66 :             need_stat(QUERY_LENGTH);
     661                 :            :         }
     662         [ +  + ]:     495720 :         if (param_k3 != 0) need_stat(WQF);
     663                 :     495720 :     }
     664                 :            : 
     665                 :      12511 :     BM25Weight()
     666                 :            :         : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
     667                 :      12511 :           param_min_normlen(0.5)
     668                 :            :     {
     669                 :      12511 :         need_stat(COLLECTION_SIZE);
     670                 :      12511 :         need_stat(RSET_SIZE);
     671                 :      12511 :         need_stat(TERMFREQ);
     672                 :      12511 :         need_stat(RELTERMFREQ);
     673                 :      12511 :         need_stat(WDF);
     674                 :      12511 :         need_stat(WDF_MAX);
     675                 :      12511 :         need_stat(DOC_LENGTH_MIN);
     676                 :      12511 :         need_stat(AVERAGE_LENGTH);
     677                 :      12511 :         need_stat(DOC_LENGTH);
     678                 :      12511 :         need_stat(WQF);
     679                 :      12511 :     }
     680                 :            : 
     681                 :            :     std::string name() const;
     682                 :            :     std::string short_name() const;
     683                 :            : 
     684                 :            :     std::string serialise() const;
     685                 :            :     BM25Weight * unserialise(const std::string & serialised) const;
     686                 :            : 
     687                 :            :     double get_sumpart(Xapian::termcount wdf,
     688                 :            :                        Xapian::termcount doclen,
     689                 :            :                        Xapian::termcount uniqterm) const;
     690                 :            :     double get_maxpart() const;
     691                 :            : 
     692                 :            :     double get_sumextra(Xapian::termcount doclen,
     693                 :            :                         Xapian::termcount uniqterms) const;
     694                 :            :     double get_maxextra() const;
     695                 :            : 
     696                 :            :     BM25Weight * create_from_parameters(const char * params) const;
     697                 :            : };
     698                 :            : 
     699                 :            : /// Xapian::Weight subclass implementing the BM25+ probabilistic formula.
     700         [ -  + ]:       6594 : class XAPIAN_VISIBILITY_DEFAULT BM25PlusWeight : public Weight {
     701                 :            :     /// Factor to multiply the document length by.
     702                 :            :     mutable Xapian::doclength len_factor;
     703                 :            : 
     704                 :            :     /// Factor combining all the document independent factors.
     705                 :            :     mutable double termweight;
     706                 :            : 
     707                 :            :     /// The BM25+ parameters.
     708                 :            :     double param_k1, param_k2, param_k3, param_b;
     709                 :            : 
     710                 :            :     /// The minimum normalised document length value.
     711                 :            :     Xapian::doclength param_min_normlen;
     712                 :            : 
     713                 :            :     /// Additional parameter delta in the BM25+ formula.
     714                 :            :     double param_delta;
     715                 :            : 
     716                 :            :     BM25PlusWeight * clone() const;
     717                 :            : 
     718                 :            :     void init(double factor);
     719                 :            : 
     720                 :            :   public:
     721                 :            :     /** Construct a BM25PlusWeight.
     722                 :            :      *
     723                 :            :      *  @param k1  A non-negative parameter controlling how influential
     724                 :            :      *             within-document-frequency (wdf) is.  k1=0 means that
     725                 :            :      *             wdf doesn't affect the weights.  The larger k1 is, the more
     726                 :            :      *             wdf influences the weights.  (default 1)
     727                 :            :      *
     728                 :            :      *  @param k2  A non-negative parameter which controls the strength of a
     729                 :            :      *             correction factor which depends upon query length and
     730                 :            :      *             normalised document length.  k2=0 disables this factor; larger
     731                 :            :      *             k2 makes it stronger.  The paper which describes BM25+
     732                 :            :      *             ignores BM25's document-independent component (so implicitly
     733                 :            :      *             k2=0), but we support non-zero k2 too.  (default 0)
     734                 :            :      *
     735                 :            :      *  @param k3  A non-negative parameter controlling how influential
     736                 :            :      *             within-query-frequency (wqf) is.  k3=0 means that wqf
     737                 :            :      *             doesn't affect the weights.  The larger k3 is, the more
     738                 :            :      *             wqf influences the weights.  (default 1)
     739                 :            :      *
     740                 :            :      *  @param b   A parameter between 0 and 1, controlling how strong the
     741                 :            :      *             document length normalisation of wdf is.  0 means no
     742                 :            :      *             normalisation; 1 means full normalisation.  (default 0.5)
     743                 :            :      *
     744                 :            :      *  @param min_normlen  A parameter specifying a minimum value for
     745                 :            :      *             normalised document length.  Normalised document length
     746                 :            :      *             values less than this will be clamped to this value, helping
     747                 :            :      *             to prevent very short documents getting large weights.
     748                 :            :      *             (default 0.5)
     749                 :            :      *
     750                 :            :      *  @param delta  A parameter for pseudo tf value to control the scale
     751                 :            :      *                of the tf lower bound. Delta(δ) can be tuned for example
     752                 :            :      *                from 0.0 to 1.5 but BM25+ can still work effectively
     753                 :            :      *                across collections with a fixed δ = 1.0. (default 1.0)
     754                 :            :      */
     755                 :        100 :     BM25PlusWeight(double k1, double k2, double k3, double b,
     756                 :            :                    double min_normlen, double delta)
     757                 :            :         : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
     758                 :        100 :           param_min_normlen(min_normlen), param_delta(delta)
     759                 :            :     {
     760         [ -  + ]:        100 :         if (param_k1 < 0) param_k1 = 0;
     761         [ -  + ]:        100 :         if (param_k2 < 0) param_k2 = 0;
     762         [ -  + ]:        100 :         if (param_k3 < 0) param_k3 = 0;
     763         [ -  + ]:        100 :         if (param_delta < 0) param_delta = 0;
     764         [ -  + ]:        100 :         if (param_b < 0) {
     765                 :          0 :             param_b = 0;
     766         [ -  + ]:        100 :         } else if (param_b > 1) {
     767                 :          0 :             param_b = 1;
     768                 :            :         }
     769                 :        100 :         need_stat(COLLECTION_SIZE);
     770                 :        100 :         need_stat(RSET_SIZE);
     771                 :        100 :         need_stat(TERMFREQ);
     772                 :        100 :         need_stat(RELTERMFREQ);
     773                 :        100 :         need_stat(WDF);
     774                 :        100 :         need_stat(WDF_MAX);
     775 [ +  + ][ +  + ]:        100 :         if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
                 [ +  + ]
     776                 :         36 :             need_stat(DOC_LENGTH_MIN);
     777                 :         36 :             need_stat(AVERAGE_LENGTH);
     778                 :            :         }
     779 [ +  + ][ +  + ]:        100 :         if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
     780         [ +  + ]:        100 :         if (param_k2 != 0) {
     781                 :          1 :             need_stat(DOC_LENGTH);
     782                 :          1 :             need_stat(QUERY_LENGTH);
     783                 :            :         }
     784         [ +  - ]:        100 :         if (param_k3 != 0) need_stat(WQF);
     785                 :        100 :     }
     786                 :            : 
     787                 :       1562 :     BM25PlusWeight()
     788                 :            :         : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
     789                 :       1562 :           param_min_normlen(0.5), param_delta(1)
     790                 :            :     {
     791                 :       1562 :         need_stat(COLLECTION_SIZE);
     792                 :       1562 :         need_stat(RSET_SIZE);
     793                 :       1562 :         need_stat(TERMFREQ);
     794                 :       1562 :         need_stat(RELTERMFREQ);
     795                 :       1562 :         need_stat(WDF);
     796                 :       1562 :         need_stat(WDF_MAX);
     797                 :       1562 :         need_stat(DOC_LENGTH_MIN);
     798                 :       1562 :         need_stat(AVERAGE_LENGTH);
     799                 :       1562 :         need_stat(DOC_LENGTH);
     800                 :       1562 :         need_stat(WQF);
     801                 :       1562 :     }
     802                 :            : 
     803                 :            :     std::string name() const;
     804                 :            :     std::string short_name() const;
     805                 :            : 
     806                 :            :     std::string serialise() const;
     807                 :            :     BM25PlusWeight * unserialise(const std::string & serialised) const;
     808                 :            : 
     809                 :            :     double get_sumpart(Xapian::termcount wdf,
     810                 :            :                        Xapian::termcount doclen,
     811                 :            :                        Xapian::termcount uniqterm) const;
     812                 :            :     double get_maxpart() const;
     813                 :            : 
     814                 :            :     double get_sumextra(Xapian::termcount doclen,
     815                 :            :                         Xapian::termcount uniqterms) const;
     816                 :            :     double get_maxextra() const;
     817                 :            : 
     818                 :            :     BM25PlusWeight * create_from_parameters(const char * params) const;
     819                 :            : };
     820                 :            : 
     821                 :            : /** Xapian::Weight subclass implementing the traditional probabilistic formula.
     822                 :            :  *
     823                 :            :  * This class implements the "traditional" Probabilistic Weighting scheme, as
     824                 :            :  * described by the early papers on Probabilistic Retrieval.  BM25 generally
     825                 :            :  * gives better results.
     826                 :            :  *
     827                 :            :  * TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
     828                 :            :  * the latter returns weights (k+1) times larger.
     829                 :            :  */
     830         [ -  + ]:       6846 : class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
     831                 :            :     /// Factor to multiply the document length by.
     832                 :            :     mutable Xapian::doclength len_factor;
     833                 :            : 
     834                 :            :     /// Factor combining all the document independent factors.
     835                 :            :     mutable double termweight;
     836                 :            : 
     837                 :            :     /// The parameter in the formula.
     838                 :            :     double param_k;
     839                 :            : 
     840                 :            :     TradWeight * clone() const;
     841                 :            : 
     842                 :            :     void init(double factor);
     843                 :            : 
     844                 :            :   public:
     845                 :            :     /** Construct a TradWeight.
     846                 :            :      *
     847                 :            :      *  @param k  A non-negative parameter controlling how influential
     848                 :            :      *            within-document-frequency (wdf) and document length are.
     849                 :            :      *            k=0 means that wdf and document length don't affect the
     850                 :            :      *            weights.  The larger k is, the more they do.  (default 1)
     851                 :            :      */
     852                 :       3458 :     explicit TradWeight(double k = 1.0) : param_k(k) {
     853         [ -  + ]:       1729 :         if (param_k < 0) param_k = 0;
     854         [ +  + ]:       1729 :         if (param_k != 0.0) {
     855                 :       1665 :             need_stat(AVERAGE_LENGTH);
     856                 :       1665 :             need_stat(DOC_LENGTH);
     857                 :            :         }
     858                 :       1729 :         need_stat(COLLECTION_SIZE);
     859                 :       1729 :         need_stat(RSET_SIZE);
     860                 :       1729 :         need_stat(TERMFREQ);
     861                 :       1729 :         need_stat(RELTERMFREQ);
     862                 :       1729 :         need_stat(DOC_LENGTH_MIN);
     863                 :       1729 :         need_stat(WDF);
     864                 :       1729 :         need_stat(WDF_MAX);
     865                 :       1729 :     }
     866                 :            : 
     867                 :            :     std::string name() const;
     868                 :            :     std::string short_name() const;
     869                 :            : 
     870                 :            :     std::string serialise() const;
     871                 :            :     TradWeight * unserialise(const std::string & serialised) const;
     872                 :            : 
     873                 :            :     double get_sumpart(Xapian::termcount wdf,
     874                 :            :                        Xapian::termcount doclen,
     875                 :            :                        Xapian::termcount uniqueterms) const;
     876                 :            :     double get_maxpart() const;
     877                 :            : 
     878                 :            :     double get_sumextra(Xapian::termcount doclen,
     879                 :            :                         Xapian::termcount uniqterms) const;
     880                 :            :     double get_maxextra() const;
     881                 :            : 
     882                 :            :     TradWeight * create_from_parameters(const char * params) const;
     883                 :            : };
     884                 :            : 
     885                 :            : /** This class implements the InL2 weighting scheme.
     886                 :            :  *
     887                 :            :  *  InL2 is a representative scheme of the Divergence from Randomness Framework
     888                 :            :  *  by Gianni Amati.
     889                 :            :  *
     890                 :            :  *  This weighting scheme is useful for tasks that require early precision.
     891                 :            :  *
     892                 :            :  *  It uses the Inverse document frequency model (In), the Laplace method to
     893                 :            :  *  find the aftereffect of sampling (L) and the second wdf normalization
     894                 :            :  *  proposed by Amati to normalize the wdf in the document to the length of the
     895                 :            :  *  document (H2).
     896                 :            :  *
     897                 :            :  *  For more information about the DFR Framework and the InL2 scheme, please
     898                 :            :  *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
     899                 :            :  *  models of information retrieval based on measuring the divergence from
     900                 :            :  *  randomness", ACM Transactions on Information Systems (TOIS) 20(4), 2002,
     901                 :            :  *  pp. 357-389.
     902                 :            :  */
     903         [ -  + ]:       6486 : class XAPIAN_VISIBILITY_DEFAULT InL2Weight : public Weight {
     904                 :            :     /// The wdf normalization parameter in the formula.
     905                 :            :     double param_c;
     906                 :            : 
     907                 :            :     /// The upper bound on the weight a term can give to a document.
     908                 :            :     double upper_bound;
     909                 :            : 
     910                 :            :     /// The constant values which are used on every call to get_sumpart().
     911                 :            :     double wqf_product_idf;
     912                 :            :     double c_product_avlen;
     913                 :            : 
     914                 :            :     InL2Weight * clone() const;
     915                 :            : 
     916                 :            :     void init(double factor);
     917                 :            : 
     918                 :            :   public:
     919                 :            :     /** Construct an InL2Weight.
     920                 :            :      *
     921                 :            :      *  @param c  A strictly positive parameter controlling the extent
     922                 :            :      *            of the normalization of the wdf to the document length. The
     923                 :            :      *            default value of 1 is suitable for longer queries but it may
     924                 :            :      *            need to be changed for shorter queries. For more information,
     925                 :            :      *            please refer to Gianni Amati's PhD thesis.
     926                 :            :      */
     927                 :            :     explicit InL2Weight(double c);
     928                 :            : 
     929                 :       1564 :     InL2Weight()
     930                 :       1564 :     : param_c(1.0)
     931                 :            :     {
     932                 :       1564 :         need_stat(AVERAGE_LENGTH);
     933                 :       1564 :         need_stat(DOC_LENGTH);
     934                 :       1564 :         need_stat(DOC_LENGTH_MIN);
     935                 :       1564 :         need_stat(DOC_LENGTH_MAX);
     936                 :       1564 :         need_stat(COLLECTION_SIZE);
     937                 :       1564 :         need_stat(WDF);
     938                 :       1564 :         need_stat(WDF_MAX);
     939                 :       1564 :         need_stat(WQF);
     940                 :       1564 :         need_stat(TERMFREQ);
     941                 :       1564 :     }
     942                 :            : 
     943                 :            :     std::string name() const;
     944                 :            :     std::string short_name() const;
     945                 :            : 
     946                 :            :     std::string serialise() const;
     947                 :            :     InL2Weight * unserialise(const std::string & serialised) const;
     948                 :            : 
     949                 :            :     double get_sumpart(Xapian::termcount wdf,
     950                 :            :                        Xapian::termcount doclen,
     951                 :            :                        Xapian::termcount uniqterms) const;
     952                 :            :     double get_maxpart() const;
     953                 :            : 
     954                 :            :     double get_sumextra(Xapian::termcount doclen,
     955                 :            :                         Xapian::termcount uniqterms) const;
     956                 :            :     double get_maxextra() const;
     957                 :            : 
     958                 :            :     InL2Weight * create_from_parameters(const char * params) const;
     959                 :            : };
     960                 :            : 
     961                 :            : /** This class implements the IfB2 weighting scheme.
     962                 :            :  *
     963                 :            :  *  IfB2 is a representative scheme of the Divergence from Randomness Framework
     964                 :            :  *  by Gianni Amati.
     965                 :            :  *
     966                 :            :  *  It uses the Inverse term frequency model (If), the Bernoulli method to find
     967                 :            :  *  the aftereffect of sampling (B) and the second wdf normalization proposed
     968                 :            :  *  by Amati to normalize the wdf in the document to the length of the document
     969                 :            :  *  (H2).
     970                 :            :  *
     971                 :            :  *  For more information about the DFR Framework and the IfB2 scheme, please
     972                 :            :  *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
     973                 :            :  *  models of information retrieval based on measuring the divergence from
     974                 :            :  *  randomness", ACM Transactions on Information Systems (TOIS) 20(4), 2002,
     975                 :            :  *  pp. 357-389.
     976                 :            :  */
     977         [ -  + ]:       6486 : class XAPIAN_VISIBILITY_DEFAULT IfB2Weight : public Weight {
     978                 :            :     /// The wdf normalization parameter in the formula.
     979                 :            :     double param_c;
     980                 :            : 
     981                 :            :     /// The upper bound on the weight.
     982                 :            :     double upper_bound;
     983                 :            : 
     984                 :            :     /// The constant values which are used for calculations in get_sumpart().
     985                 :            :     double wqf_product_idf;
     986                 :            :     double c_product_avlen;
     987                 :            :     double B_constant;
     988                 :            : 
     989                 :            :     IfB2Weight * clone() const;
     990                 :            : 
     991                 :            :     void init(double factor);
     992                 :            : 
     993                 :            :   public:
     994                 :            :     /** Construct an IfB2Weight.
     995                 :            :      *
     996                 :            :      *  @param c  A strictly positive parameter controlling the extent
     997                 :            :      *            of the normalization of the wdf to the document length. The
     998                 :            :      *            default value of 1 is suitable for longer queries but it may
     999                 :            :      *            need to be changed for shorter queries. For more information,
    1000                 :            :      *            please refer to Gianni Amati's PhD thesis titled
    1001                 :            :      *            Probabilistic Models for Information Retrieval based on
    1002                 :            :      *            Divergence from Randomness.
    1003                 :            :      */
    1004                 :            :     explicit IfB2Weight(double c);
    1005                 :            : 
    1006                 :       3128 :     IfB2Weight() : param_c(1.0) {
    1007                 :       1564 :         need_stat(AVERAGE_LENGTH);
    1008                 :       1564 :         need_stat(DOC_LENGTH);
    1009                 :       1564 :         need_stat(DOC_LENGTH_MIN);
    1010                 :       1564 :         need_stat(DOC_LENGTH_MAX);
    1011                 :       1564 :         need_stat(COLLECTION_SIZE);
    1012                 :       1564 :         need_stat(COLLECTION_FREQ);
    1013                 :       1564 :         need_stat(WDF);
    1014                 :       1564 :         need_stat(WDF_MAX);
    1015                 :       1564 :         need_stat(WQF);
    1016                 :       1564 :         need_stat(TERMFREQ);
    1017                 :       1564 :     }
    1018                 :            : 
    1019                 :            :     std::string name() const;
    1020                 :            :     std::string short_name() const;
    1021                 :            : 
    1022                 :            :     std::string serialise() const;
    1023                 :            :     IfB2Weight * unserialise(const std::string & serialised) const;
    1024                 :            : 
    1025                 :            :     double get_sumpart(Xapian::termcount wdf,
    1026                 :            :                        Xapian::termcount doclen,
    1027                 :            :                        Xapian::termcount uniqterm) const;
    1028                 :            :     double get_maxpart() const;
    1029                 :            : 
    1030                 :            :     double get_sumextra(Xapian::termcount doclen,
    1031                 :            :                         Xapian::termcount uniqterms) const;
    1032                 :            :     double get_maxextra() const;
    1033                 :            : 
    1034                 :            :     IfB2Weight * create_from_parameters(const char * params) const;
    1035                 :            : };
    1036                 :            : 
    1037                 :            : /** This class implements the IneB2 weighting scheme.
    1038                 :            :  *
    1039                 :            :  *  IneB2 is a representative scheme of the Divergence from Randomness
    1040                 :            :  *  Framework by Gianni Amati.
    1041                 :            :  *
    1042                 :            :  *  It uses the Inverse expected document frequency model (Ine), the Bernoulli
    1043                 :            :  *  method to find the aftereffect of sampling (B) and the second wdf
    1044                 :            :  *  normalization proposed by Amati to normalize the wdf in the document to the
    1045                 :            :  *  length of the document (H2).
    1046                 :            :  *
    1047                 :            :  *  For more information about the DFR Framework and the IneB2 scheme, please
    1048                 :            :  *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
    1049                 :            :  *  models of information retrieval based on measuring the divergence from
    1050                 :            :  *  randomness", ACM Transactions on Information Systems (TOIS) 20(4), 2002,
    1051                 :            :  *  pp. 357-389.
    1052                 :            :  */
    1053         [ -  + ]:       6486 : class XAPIAN_VISIBILITY_DEFAULT IneB2Weight : public Weight {
    1054                 :            :     /// The wdf normalization parameter in the formula.
    1055                 :            :     double param_c;
    1056                 :            : 
    1057                 :            :     /// The upper bound of the weight.
    1058                 :            :     double upper_bound;
    1059                 :            : 
    1060                 :            :     /// Constant values used in get_sumpart().
    1061                 :            :     double wqf_product_idf;
    1062                 :            :     double c_product_avlen;
    1063                 :            :     double B_constant;
    1064                 :            : 
    1065                 :            :     IneB2Weight * clone() const;
    1066                 :            : 
    1067                 :            :     void init(double factor);
    1068                 :            : 
    1069                 :            :   public:
    1070                 :            :     /** Construct an IneB2Weight.
    1071                 :            :      *
    1072                 :            :      *  @param c  A strictly positive parameter controlling the extent
    1073                 :            :      *            of the normalization of the wdf to the document length. The
    1074                 :            :      *            default value of 1 is suitable for longer queries but it may
    1075                 :            :      *            need to be changed for shorter queries. For more information,
    1076                 :            :      *            please refer to Gianni Amati's PhD thesis.
    1077                 :            :      */
    1078                 :            :     explicit IneB2Weight(double c);
    1079                 :            : 
    1080                 :       3128 :     IneB2Weight() : param_c(1.0) {
    1081                 :       1564 :         need_stat(AVERAGE_LENGTH);
    1082                 :       1564 :         need_stat(DOC_LENGTH);
    1083                 :       1564 :         need_stat(DOC_LENGTH_MIN);
    1084                 :       1564 :         need_stat(DOC_LENGTH_MAX);
    1085                 :       1564 :         need_stat(COLLECTION_SIZE);
    1086                 :       1564 :         need_stat(WDF);
    1087                 :       1564 :         need_stat(WDF_MAX);
    1088                 :       1564 :         need_stat(WQF);
    1089                 :       1564 :         need_stat(COLLECTION_FREQ);
    1090                 :       1564 :         need_stat(TERMFREQ);
    1091                 :       1564 :     }
    1092                 :            : 
    1093                 :            :     std::string name() const;
    1094                 :            :     std::string short_name() const;
    1095                 :            : 
    1096                 :            :     std::string serialise() const;
    1097                 :            :     IneB2Weight * unserialise(const std::string & serialised) const;
    1098                 :            : 
    1099                 :            :     double get_sumpart(Xapian::termcount wdf,
    1100                 :            :                        Xapian::termcount doclen,
    1101                 :            :                        Xapian::termcount uniqterms) const;
    1102                 :            :     double get_maxpart() const;
    1103                 :            : 
    1104                 :            :     double get_sumextra(Xapian::termcount doclen,
    1105                 :            :                         Xapian::termcount uniqterms) const;
    1106                 :            :     double get_maxextra() const;
    1107                 :            : 
    1108                 :            :     IneB2Weight * create_from_parameters(const char * params) const;
    1109                 :            : };
    1110                 :            : 
    1111                 :            : /** This class implements the BB2 weighting scheme.
    1112                 :            :  *
    1113                 :            :  *  BB2 is a representative scheme of the Divergence from Randomness Framework
    1114                 :            :  *  by Gianni Amati.
    1115                 :            :  *
    1116                 :            :  *  It uses the Bose-Einstein probabilistic distribution (B) along with
    1117                 :            :  *  Stirling's power approximation, the Bernoulli method to find the
    1118                 :            :  *  aftereffect of sampling (B) and the second wdf normalization proposed by
    1119                 :            :  *  Amati to normalize the wdf in the document to the length of the document
    1120                 :            :  *  (H2).
    1121                 :            :  *
    1122                 :            :  *  For more information about the DFR Framework and the BB2 scheme, please
    1123                 :            :  *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
    1124                 :            :  *  models of information retrieval based on measuring the divergence from
    1125                 :            :  *  randomness", ACM Transactions on Information Systems (TOIS) 20(4), 2002,
    1126                 :            :  *  pp. 357-389.
    1127                 :            :  */
    1128         [ -  + ]:       6714 : class XAPIAN_VISIBILITY_DEFAULT BB2Weight : public Weight {
    1129                 :            :     /// The wdf normalization parameter in the formula.
    1130                 :            :     double param_c;
    1131                 :            : 
    1132                 :            :     /// The upper bound on the weight.
    1133                 :            :     double upper_bound;
    1134                 :            : 
    1135                 :            :     /// The constant values to be used in get_sumpart().
    1136                 :            :     double c_product_avlen;
    1137                 :            :     double B_constant;
    1138                 :            :     double wt;
    1139                 :            :     double stirling_constant_1;
    1140                 :            :     double stirling_constant_2;
    1141                 :            : 
    1142                 :            :     BB2Weight * clone() const;
    1143                 :            : 
    1144                 :            :     void init(double factor);
    1145                 :            : 
    1146                 :            :   public:
    1147                 :            :     /** Construct a BB2Weight.
    1148                 :            :      *
    1149                 :            :      *  @param c  A strictly positive parameter controlling the extent
    1150                 :            :      *            of the normalization of the wdf to the document length. The
    1151                 :            :      *            default value of 1 is suitable for longer queries but it may
    1152                 :            :      *            need to be changed for shorter queries. For more information,
    1153                 :            :      *            please refer to Gianni Amati's PhD thesis titled
    1154                 :            :      *            Probabilistic Models for Information Retrieval based on
    1155                 :            :      *            Divergence from Randomness.
    1156                 :            :      */
    1157                 :            :     explicit BB2Weight(double c);
    1158                 :            : 
    1159                 :       3142 :     BB2Weight() : param_c(1.0) {
    1160                 :       1571 :         need_stat(AVERAGE_LENGTH);
    1161                 :       1571 :         need_stat(DOC_LENGTH);
    1162                 :       1571 :         need_stat(DOC_LENGTH_MIN);
    1163                 :       1571 :         need_stat(DOC_LENGTH_MAX);
    1164                 :       1571 :         need_stat(COLLECTION_SIZE);
    1165                 :       1571 :         need_stat(COLLECTION_FREQ);
    1166                 :       1571 :         need_stat(WDF);
    1167                 :       1571 :         need_stat(WDF_MAX);
    1168                 :       1571 :         need_stat(WQF);
    1169                 :       1571 :         need_stat(TERMFREQ);
    1170                 :       1571 :     }
    1171                 :            : 
    1172                 :            :     std::string name() const;
    1173                 :            :     std::string short_name() const;
    1174                 :            : 
    1175                 :            :     std::string serialise() const;
    1176                 :            :     BB2Weight * unserialise(const std::string & serialised) const;
    1177                 :            : 
    1178                 :            :     double get_sumpart(Xapian::termcount wdf,
    1179                 :            :                        Xapian::termcount doclen,
    1180                 :            :                        Xapian::termcount uniqterms) const;
    1181                 :            :     double get_maxpart() const;
    1182                 :            : 
    1183                 :            :     double get_sumextra(Xapian::termcount doclen,
    1184                 :            :                         Xapian::termcount uniqterms) const;
    1185                 :            :     double get_maxextra() const;
    1186                 :            : 
    1187                 :            :     BB2Weight * create_from_parameters(const char * params) const;
    1188                 :            : };
    1189                 :            : 
    1190                 :            : /** This class implements the DLH weighting scheme, which is a representative
    1191                 :            :  *  scheme of the Divergence from Randomness Framework by Gianni Amati.
    1192                 :            :  *
    1193                 :            :  *  This is a parameter-free weighting scheme and it should be used with query
    1194                 :            :  *  expansion to obtain better results. It uses the hypergeometric probabilistic
    1195                 :            :  *  model and Laplace's normalization to calculate the risk gain.
    1196                 :            :  *
    1197                 :            :  *  For more information about the DFR Framework and the DLH scheme, please
    1198                 :            :  *  refer to:
    1199                 :            :  *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic
    1200                 :            :  *  models of information retrieval based on measuring the divergence from
    1201                 :            :  *  randomness", ACM Transactions on Information Systems (TOIS) 20(4), 2002,
    1202                 :            :  *  pp. 357-389.
    1203                 :            :  *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
    1204                 :            :  *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
    1205                 :            :  *  Proceedings of the 16th Text REtrieval Conference (TREC-2007), 2008.
    1206                 :            :  */
    1207         [ -  + ]:       6578 : class XAPIAN_VISIBILITY_DEFAULT DLHWeight : public Weight {
    1208                 :            :     /// The upper bound on the weight.
    1209                 :            :     double upper_bound;
    1210                 :            : 
    1211                 :            :     /// The constant values to be used in get_sumpart().
    1212                 :            :     double log_constant;
    1213                 :            :     double wqf_product_factor;
    1214                 :            : 
    1215                 :            :     DLHWeight * clone() const;
    1216                 :            : 
    1217                 :            :     void init(double factor);
    1218                 :            : 
    1219                 :            :   public:
    1220                 :       3314 :     DLHWeight() {
    1221                 :       1657 :         need_stat(AVERAGE_LENGTH);
    1222                 :       1657 :         need_stat(DOC_LENGTH);
    1223                 :       1657 :         need_stat(COLLECTION_SIZE);
    1224                 :       1657 :         need_stat(COLLECTION_FREQ);
    1225                 :       1657 :         need_stat(WDF);
    1226                 :       1657 :         need_stat(WQF);
    1227                 :       1657 :         need_stat(WDF_MAX);
    1228                 :       1657 :         need_stat(DOC_LENGTH_MIN);
    1229                 :       1657 :         need_stat(DOC_LENGTH_MAX);
    1230                 :       1657 :     }
    1231                 :            : 
    1232                 :            :     std::string name() const;
    1233                 :            :     std::string short_name() const;
    1234                 :            : 
    1235                 :            :     std::string serialise() const;
    1236                 :            :     DLHWeight * unserialise(const std::string & serialised) const;
    1237                 :            : 
    1238                 :            :     double get_sumpart(Xapian::termcount wdf,
    1239                 :            :                        Xapian::termcount doclen,
    1240                 :            :                        Xapian::termcount uniqterms) const;
    1241                 :            :     double get_maxpart() const;
    1242                 :            : 
    1243                 :            :     double get_sumextra(Xapian::termcount doclen,
    1244                 :            :                         Xapian::termcount uniqterms) const;
    1245                 :            :     double get_maxextra() const;
    1246                 :            : 
    1247                 :            :     DLHWeight * create_from_parameters(const char * params) const;
    1248                 :            : };
    1249                 :            : 
    1250                 :            : /** This class implements the PL2 weighting scheme.
    1251                 :            :  *
    1252                 :            :  *  PL2 is a representative scheme of the Divergence from Randomness Framework
    1253                 :            :  *  by Gianni Amati.
    1254                 :            :  *
    1255                 :            :  *  This weighting scheme is useful for tasks that require early precision.
    1256                 :            :  *
    1257                 :            :  *  It uses the Poisson approximation of the Binomial Probabilistic distribution
    1258                 :            :  *  (P) along with Stirling's approximation for the factorial value, the Laplace
    1259                 :            :  *  method to find the aftereffect of sampling (L) and the second wdf
    1260                 :            :  *  normalization proposed by Amati to normalize the wdf in the document to the
    1261                 :            :  *  length of the document (H2).
    1262                 :            :  *
    1263                 :            :  *  For more information about the DFR Framework and the PL2 scheme, please
    1264                 :            :  *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen, "Probabilistic models
    1265                 :            :  *  of information retrieval based on measuring the divergence from randomness",
    1266                 :            :  *  ACM Transactions on Information Systems (TOIS) 20(4), 2002, pp. 357-389.
    1267                 :            :  */
    1268         [ -  + ]:       6484 : class XAPIAN_VISIBILITY_DEFAULT PL2Weight : public Weight {
    1269                 :            :     /// The factor to multiply weights by.
    1270                 :            :     double factor;
    1271                 :            : 
    1272                 :            :     /// The wdf normalization parameter in the formula.
    1273                 :            :     double param_c;
    1274                 :            : 
    1275                 :            :     /// The upper bound on the weight.
    1276                 :            :     double upper_bound;
    1277                 :            : 
    1278                 :            :     /// Constants for a given term in a given query.
    1279                 :            :     double P1, P2;
    1280                 :            : 
    1281                 :            :     /// Set by init() to (param_c * get_average_length())
    1282                 :            :     double cl;
    1283                 :            : 
    1284                 :            :     PL2Weight * clone() const;
    1285                 :            : 
    1286                 :            :     void init(double factor_);
    1287                 :            : 
    1288                 :            :   public:
    1289                 :            :     /** Construct a PL2Weight.
    1290                 :            :      *
    1291                 :            :      *  @param c  A strictly positive parameter controlling the extent
    1292                 :            :      *            of the normalization of the wdf to the document length. The
    1293                 :            :      *            default value of 1 is suitable for longer queries but it may
    1294                 :            :      *            need to be changed for shorter queries. For more information,
    1295                 :            :      *            please refer to Gianni Amati's PhD thesis titled
    1296                 :            :      *            "Probabilistic Models for Information Retrieval based on
    1297                 :            :      *            Divergence from Randomness".
    1298                 :            :      */
    1299                 :            :     explicit PL2Weight(double c);
    1300                 :            : 
    1301                 :       3126 :     PL2Weight() : param_c(1.0) {
    1302                 :       1563 :         need_stat(AVERAGE_LENGTH);
    1303                 :       1563 :         need_stat(DOC_LENGTH);
    1304                 :       1563 :         need_stat(DOC_LENGTH_MIN);
    1305                 :       1563 :         need_stat(DOC_LENGTH_MAX);
    1306                 :       1563 :         need_stat(COLLECTION_SIZE);
    1307                 :       1563 :         need_stat(COLLECTION_FREQ);
    1308                 :       1563 :         need_stat(WDF);
    1309                 :       1563 :         need_stat(WDF_MAX);
    1310                 :       1563 :         need_stat(WQF);
    1311                 :       1563 :     }
    1312                 :            : 
    1313                 :            :     std::string name() const;
    1314                 :            :     std::string short_name() const;
    1315                 :            : 
    1316                 :            :     std::string serialise() const;
    1317                 :            :     PL2Weight * unserialise(const std::string & serialised) const;
    1318                 :            : 
    1319                 :            :     double get_sumpart(Xapian::termcount wdf,
    1320                 :            :                        Xapian::termcount doclen,
    1321                 :            :                        Xapian::termcount uniqterms) const;
    1322                 :            :     double get_maxpart() const;
    1323                 :            : 
    1324                 :            :     double get_sumextra(Xapian::termcount doclen,
    1325                 :            :                         Xapian::termcount uniqterms) const;
    1326                 :            :     double get_maxextra() const;
    1327                 :            : 
    1328                 :            :     PL2Weight * create_from_parameters(const char * params) const;
    1329                 :            : };
    1330                 :            : 
    1331                 :            : /// Xapian::Weight subclass implementing the PL2+ probabilistic formula.
    1332         [ -  + ]:       6600 : class XAPIAN_VISIBILITY_DEFAULT PL2PlusWeight : public Weight {
    1333                 :            :     /// The factor to multiply weights by.
    1334                 :            :     double factor;
    1335                 :            : 
    1336                 :            :     /// The wdf normalization parameter in the formula.
    1337                 :            :     double param_c;
    1338                 :            : 
    1339                 :            :     /// Additional parameter delta in the PL2+ weighting formula.
    1340                 :            :     double param_delta;
    1341                 :            : 
    1342                 :            :     /// The upper bound on the weight.
    1343                 :            :     double upper_bound;
    1344                 :            : 
    1345                 :            :     /// Constants for a given term in a given query.
    1346                 :            :     double P1, P2;
    1347                 :            : 
    1348                 :            :     /// Set by init() to (param_c * get_average_length())
    1349                 :            :     double cl;
    1350                 :            : 
    1351                 :            :     /// Set by init() to get_collection_freq() / get_collection_size()
    1352                 :            :     double mean;
    1353                 :            : 
    1354                 :            :     /// Weight contribution of delta term in the PL2+ function
    1355                 :            :     double dw;
    1356                 :            : 
    1357                 :            :     PL2PlusWeight * clone() const;
    1358                 :            : 
    1359                 :            :     void init(double factor_);
    1360                 :            : 
    1361                 :            :   public:
    1362                 :            :     /** Construct a PL2PlusWeight.
    1363                 :            :      *
    1364                 :            :      *  @param c  A strictly positive parameter controlling the extent
    1365                 :            :      *            of the normalization of the wdf to the document length. The
    1366                 :            :      *            default value of 1 is suitable for longer queries but it may
    1367                 :            :      *            need to be changed for shorter queries. For more information,
    1368                 :            :      *            please refer to Gianni Amati's PhD thesis titled
    1369                 :            :      *            "Probabilistic Models for Information Retrieval based on
    1370                 :            :      *            Divergence from Randomness".
    1371                 :            :      *
    1372                 :            :      *  @param delta  A parameter for pseudo tf value to control the scale
    1373                 :            :      *                of the tf lower bound. Delta(δ) should be a positive
    1374                 :            :      *                real number. It can be tuned for example from 0.1 to 1.5
    1375                 :            :      *                in increments of 0.1 or so. Experiments have shown that
    1376                 :            :      *                PL2+ works effectively across collections with a fixed δ = 0.8
    1377                 :            :      *                (default 0.8)
    1378                 :            :      */
    1379                 :            :     PL2PlusWeight(double c, double delta);
    1380                 :            : 
    1381                 :       1563 :     PL2PlusWeight()
    1382                 :       1563 :         : param_c(1.0), param_delta(0.8) {
    1383                 :       1563 :         need_stat(AVERAGE_LENGTH);
    1384                 :       1563 :         need_stat(DOC_LENGTH);
    1385                 :       1563 :         need_stat(DOC_LENGTH_MIN);
    1386                 :       1563 :         need_stat(DOC_LENGTH_MAX);
    1387                 :       1563 :         need_stat(COLLECTION_SIZE);
    1388                 :       1563 :         need_stat(COLLECTION_FREQ);
    1389                 :       1563 :         need_stat(WDF);
    1390                 :       1563 :         need_stat(WDF_MAX);
    1391                 :       1563 :         need_stat(WQF);
    1392                 :       1563 :     }
    1393                 :            : 
    1394                 :            :     std::string name() const;
    1395                 :            :     std::string short_name() const;
    1396                 :            : 
    1397                 :            :     std::string serialise() const;
    1398                 :            :     PL2PlusWeight * unserialise(const std::string & serialised) const;
    1399                 :            : 
    1400                 :            :     double get_sumpart(Xapian::termcount wdf,
    1401                 :            :                        Xapian::termcount doclen,
    1402                 :            :                        Xapian::termcount uniqterms) const;
    1403                 :            :     double get_maxpart() const;
    1404                 :            : 
    1405                 :            :     double get_sumextra(Xapian::termcount doclen,
    1406                 :            :                         Xapian::termcount uniqterms) const;
    1407                 :            :     double get_maxextra() const;
    1408                 :            : 
    1409                 :            :     PL2PlusWeight * create_from_parameters(const char * params) const;
    1410                 :            : };
    1411                 :            : 
    1412                 :            : /** This class implements the DPH weighting scheme.
    1413                 :            :  *
    1414                 :            :  *  DPH is a representative scheme of the Divergence from Randomness Framework
    1415                 :            :  *  by Gianni Amati.
    1416                 :            :  *
    1417                 :            :  *  This is a parameter free weighting scheme and it should be used with query
    1418                 :            :  *  expansion to obtain better results. It uses the HyperGeometric Probabilistic
    1419                 :            :  *  model and Popper's normalization to calculate the risk gain.
    1420                 :            :  *
    1421                 :            :  *  For more information about the DFR Framework and the DPH scheme, please
    1422                 :            :  *  refer to:
    1423                 :            :  *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen,
    1424                 :            :  *  "Probabilistic models of information retrieval based on measuring the
    1425                 :            :  *  divergence from randomness", ACM Transactions on Information Systems (TOIS)
    1426                 :            :  *  20(4), 2002, pp. 357-389.
    1427                 :            :  *  b.) "FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track",
    1428                 :            :  *  G. Amati, E. Ambrosi, M. Bianchi, C. Gaibisso and G. Gambosi,
    1429                 :            :  *  Proceedings of the 16th Text Retrieval Conference (TREC-2007), 2008.
    1430                 :            :  */
    1431         [ -  + ]:       6576 : class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight {
    1432                 :            :     /// The upper bound on the weight.
    1433                 :            :     double upper_bound;
    1434                 :            : 
    1435                 :            :     /// The constant value used in get_sumpart().
    1436                 :            :     double log_constant;
    1437                 :            :     double wqf_product_factor;
    1438                 :            : 
    1439                 :            :     DPHWeight * clone() const;
    1440                 :            : 
    1441                 :            :     void init(double factor);
    1442                 :            : 
    1443                 :            :   public:
    1444                 :            :     /** Construct a DPHWeight. */
    1445                 :       3312 :     DPHWeight() {
    1446                 :       1656 :         need_stat(AVERAGE_LENGTH);
    1447                 :       1656 :         need_stat(DOC_LENGTH);
    1448                 :       1656 :         need_stat(COLLECTION_SIZE);
    1449                 :       1656 :         need_stat(COLLECTION_FREQ);
    1450                 :       1656 :         need_stat(WDF);
    1451                 :       1656 :         need_stat(WQF);
    1452                 :       1656 :         need_stat(WDF_MAX);
    1453                 :       1656 :         need_stat(DOC_LENGTH_MIN);
    1454                 :       1656 :         need_stat(DOC_LENGTH_MAX);
    1455                 :       1656 :     }
    1456                 :            : 
    1457                 :            :     std::string name() const;
    1458                 :            :     std::string short_name() const;
    1459                 :            : 
    1460                 :            :     std::string serialise() const;
    1461                 :            :     DPHWeight * unserialise(const std::string & serialised) const;
    1462                 :            : 
    1463                 :            :     double get_sumpart(Xapian::termcount wdf,
    1464                 :            :                        Xapian::termcount doclen,
    1465                 :            :                        Xapian::termcount uniqterms) const;
    1466                 :            :     double get_maxpart() const;
    1467                 :            : 
    1468                 :            :     double get_sumextra(Xapian::termcount doclen,
    1469                 :            :                         Xapian::termcount uniqterms) const;
    1470                 :            :     double get_maxextra() const;
    1471                 :            : 
    1472                 :            :     DPHWeight * create_from_parameters(const char * params) const;
    1473                 :            : };
    1474                 :            : 
    1475                 :            : 
    1476                 :            : /** Xapian::Weight subclass implementing the Language Model formula.
    1477                 :            :  *
    1478                 :            :  * This class implements the "Language Model" Weighting scheme, as
    1479                 :            :  * described by the early papers on LM by Bruce Croft.
    1480                 :            :  *
    1481                 :            :  * LM works by comparing the query to a Language Model of the document.
    1482                 :            :  * The language model itself is parameter-free, though LMWeight takes
    1483                 :            :  * parameters which specify the smoothing used.
    1484                 :            :  */
    1485         [ -  + ]:       7508 : class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {
    1486                 :            :     /// The factor to multiply weights by.
    1487                 :            :     double factor;
    1488                 :            : 
    1489                 :            :     /** The type of smoothing to use. */
    1490                 :            :     type_smoothing select_smoothing;
    1491                 :            : 
    1492                 :            :     // Parameters for handling negative value of log, and for smoothing.
    1493                 :            :     double param_log, param_smoothing1, param_smoothing2;
    1494                 :            : 
    1495                 :            :     // Collection weight.
    1496                 :            :     double weight_collection;
    1497                 :            : 
    1498                 :            :     LMWeight * clone() const;
    1499                 :            : 
    1500                 :            :     void init(double factor_);
    1501                 :            : 
    1502                 :            :   public:
    1503                 :            :     /** Construct a LMWeight.
    1504                 :            :      *
    1505                 :            :      *  @param param_log_       A non-negative parameter controlling how much
    1506                 :            :      *                          to clamp negative values returned by the log.
    1507                 :            :      *                          The log is calculated by multiplying the
    1508                 :            :      *                          actual weight by param_log.  If param_log is
    1509                 :            :      *                          0.0, then the document length upper bound will
    1510                 :            :      *                          be used (default: document length upper bound)
    1511                 :            :      *
    1512                 :            :      *  @param select_smoothing_        A parameter of type enum
    1513                 :            :      *                                  type_smoothing.  This parameter
    1514                 :            :      *                                  controls which smoothing type to use.
    1515                 :            :      *                                  (default: TWO_STAGE_SMOOTHING)
    1516                 :            :      *
    1517                 :            :      *  @param param_smoothing1_        A non-negative parameter for smoothing
    1518                 :            :      *                                  whose meaning depends on
    1519                 :            :      *                                  select_smoothing_.  In
    1520                 :            :      *                                  JELINEK_MERCER_SMOOTHING, it plays the
    1521                 :            :      *                                  role of estimation and in
    1522                 :            :      *                                  DIRICHLET_SMOOTHING the role of query
    1523                 :            :      *                                  modelling. (default JELINEK_MERCER,
    1524                 :            :      *                                  ABSOLUTE, TWOSTAGE(0.7),
    1525                 :            :      *                                  DIRICHLET(2000))
    1526                 :            :      *
    1527                 :            :      *  @param param_smoothing2_        A non-negative parameter which is used
    1528                 :            :      *                                  with TWO_STAGE_SMOOTHING as parameter for Dirichlet's
    1529                 :            :      *                                  smoothing (default: 2000) and as parameter delta to
    1530                 :            :      *                                  control the scale of the tf lower bound in the
    1531                 :            :      *                                  DIRICHLET_PLUS_SMOOTHING (default 0.05).
    1532                 :            :      *
    1533                 :            :      */
    1534                 :            :     // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing.
    1535                 :       1919 :     explicit LMWeight(double param_log_ = 0.0,
    1536                 :            :                       type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
    1537                 :            :                       double param_smoothing1_ = -1.0,
    1538                 :            :                       double param_smoothing2_ = -1.0)
    1539                 :            :         : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
    1540                 :       1919 :           param_smoothing2(param_smoothing2_)
    1541                 :            :     {
    1542         [ +  + ]:       1919 :         if (param_smoothing1 < 0) param_smoothing1 = 0.7;
    1543         [ +  + ]:       1919 :         if (param_smoothing2 < 0) {
    1544         [ +  - ]:       1570 :             if (select_smoothing == TWO_STAGE_SMOOTHING)
    1545                 :       1570 :                 param_smoothing2 = 2000.0;
    1546                 :            :             else
    1547                 :       1570 :                 param_smoothing2 = 0.05;
    1548                 :            :         }
    1549                 :       1919 :         need_stat(AVERAGE_LENGTH);
    1550                 :       1919 :         need_stat(DOC_LENGTH);
    1551                 :       1919 :         need_stat(COLLECTION_SIZE);
    1552                 :       1919 :         need_stat(RSET_SIZE);
    1553                 :       1919 :         need_stat(TERMFREQ);
    1554                 :       1919 :         need_stat(RELTERMFREQ);
    1555                 :       1919 :         need_stat(DOC_LENGTH_MAX);
    1556                 :       1919 :         need_stat(WDF);
    1557                 :       1919 :         need_stat(WDF_MAX);
    1558                 :       1919 :         need_stat(COLLECTION_FREQ);
    1559         [ +  + ]:       1919 :         if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
    1560                 :         32 :             need_stat(UNIQUE_TERMS);
    1561         [ +  + ]:       1919 :         if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
    1562                 :         32 :             need_stat(DOC_LENGTH_MIN);
    1563                 :       1919 :     }
    1564                 :            : 
    1565                 :            :     std::string name() const;
    1566                 :            :     std::string short_name() const;
    1567                 :            : 
    1568                 :            :     std::string serialise() const;
    1569                 :            :     LMWeight * unserialise(const std::string & serialised) const;
    1570                 :            : 
    1571                 :            :     double get_sumpart(Xapian::termcount wdf,
    1572                 :            :                        Xapian::termcount doclen,
    1573                 :            :                        Xapian::termcount uniqterm) const;
    1574                 :            :     double get_maxpart() const;
    1575                 :            : 
    1576                 :            :     double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
    1577                 :            :     double get_maxextra() const;
    1578                 :            : 
    1579                 :            :     LMWeight * create_from_parameters(const char * params) const;
    1580                 :            : };
    1581                 :            : 
    1582                 :            : /** Xapian::Weight subclass implementing Coordinate Matching.
    1583                 :            :  *
    1584                 :            :  *  Each matching term scores one point.  See Managing Gigabytes, Second Edition
    1585                 :            :  *  p181.
    1586                 :            :  */
    1587         [ -  + ]:       6670 : class XAPIAN_VISIBILITY_DEFAULT CoordWeight : public Weight {
    1588                 :            :     /// The factor to multiply weights by.
    1589                 :            :     double factor;
    1590                 :            : 
    1591                 :            :   public:
    1592                 :            :     CoordWeight * clone() const;
    1593                 :            : 
    1594                 :            :     void init(double factor_);
    1595                 :            : 
    1596                 :            :     /** Construct a CoordWeight. */
    1597                 :       3348 :     CoordWeight() { }
    1598                 :            : 
    1599                 :            :     std::string name() const;
    1600                 :            :     std::string short_name() const;
    1601                 :            : 
    1602                 :            :     std::string serialise() const;
    1603                 :            :     CoordWeight * unserialise(const std::string & serialised) const;
    1604                 :            : 
    1605                 :            :     double get_sumpart(Xapian::termcount wdf,
    1606                 :            :                        Xapian::termcount doclen,
    1607                 :            :                        Xapian::termcount uniqterm) const;
    1608                 :            :     double get_maxpart() const;
    1609                 :            : 
    1610                 :            :     double get_sumextra(Xapian::termcount, Xapian::termcount) const;
    1611                 :            :     double get_maxextra() const;
    1612                 :            : 
    1613                 :            :     CoordWeight * create_from_parameters(const char * params) const;
    1614                 :            : };
    1615                 :            : 
    1616                 :            : /** Xapian::Weight subclass implementing Dice Coefficient.
    1617                 :            :  *
    1618                 :            :  *  Dice Coefficient measures the degree of similarity between a
    1619                 :            :  *  pair of sets (e.g. between two documents or a document and a query).
    1620                 :            :  *
    1621                 :            :  *  Jaccard coefficient and Cosine coefficient are other similarity
    1622                 :            :  *  coefficients.
    1623                 :            :  */
    1624         [ -  + ]:       6540 : class XAPIAN_VISIBILITY_DEFAULT DiceCoeffWeight : public Weight {
    1625                 :            :     /// The factor to multiply weights by.
    1626                 :            :     double factor;
    1627                 :            : 
    1628                 :            :     /// Upper bound on the weight
    1629                 :            :     double upper_bound;
    1630                 :            : 
    1631                 :            :     void init(double factor_);
    1632                 :            : 
    1633                 :            :   public:
    1634                 :            :     DiceCoeffWeight * clone() const;
    1635                 :            : 
    1636                 :            :     /** Construct a DiceCoeffWeight. */
    1637                 :       3288 :     DiceCoeffWeight() {
    1638                 :       1644 :         need_stat(DOC_LENGTH_MIN);
    1639                 :       1644 :         need_stat(QUERY_LENGTH);
    1640                 :       1644 :         need_stat(UNIQUE_TERMS);
    1641                 :       1644 :     }
    1642                 :            : 
    1643                 :            :     std::string name() const;
    1644                 :            :     std::string short_name() const;
    1645                 :            : 
    1646                 :            :     std::string serialise() const;
    1647                 :            :     DiceCoeffWeight * unserialise(const std::string & serialised) const;
    1648                 :            : 
    1649                 :            :     double get_sumpart(Xapian::termcount wdf,
    1650                 :            :                        Xapian::termcount doclen,
    1651                 :            :                        Xapian::termcount uniqterm) const;
    1652                 :            :     double get_maxpart() const;
    1653                 :            : 
    1654                 :            :     double get_sumextra(Xapian::termcount, Xapian::termcount) const;
    1655                 :            :     double get_maxextra() const;
    1656                 :            : 
    1657                 :            :     DiceCoeffWeight * create_from_parameters(const char * params) const;
    1658                 :            : };
    1659                 :            : }
    1660                 :            : 
    1661                 :            : #endif // XAPIAN_INCLUDED_WEIGHT_H

Generated by: LCOV version 1.11