LCOV - code coverage report
Current view: top level - weight - bm25plusweight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 57 86 66.3 %
Date: 2019-05-23 11:15:29 Functions: 9 11 81.8 %
Branches: 38 132 28.8 %

           Branch data     Line data    Source code
       1                 :            : /** @file bm25plusweight.cc
       2                 :            :  * @brief Xapian::BM25PlusWeight class - the BM25+ probabilistic formula
       3                 :            :  */
       4                 :            : /* Copyright (C) 2009,2010,2011,2012,2014,2015,2016 Olly Betts
       5                 :            :  * Copyright (C) 2016  Vivek Pal
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or
       8                 :            :  * modify it under the terms of the GNU General Public License as
       9                 :            :  * published by the Free Software Foundation; either version 2 of the
      10                 :            :  * License, or (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "xapian/weight.h"
      25                 :            : #include "weightinternal.h"
      26                 :            : 
      27                 :            : #include "debuglog.h"
      28                 :            : #include "omassert.h"
      29                 :            : #include "serialise-double.h"
      30                 :            : 
      31                 :            : #include "xapian/error.h"
      32                 :            : 
      33                 :            : #include <algorithm>
      34                 :            : #include <cmath>
      35                 :            : 
      36                 :            : using namespace std;
      37                 :            : 
      38                 :            : namespace Xapian {
      39                 :            : 
      40                 :            : BM25PlusWeight *
      41                 :         69 : BM25PlusWeight::clone() const
      42                 :            : {
      43                 :            :     return new BM25PlusWeight(param_k1, param_k2, param_k3, param_b,
      44                 :         69 :                               param_min_normlen, param_delta);
      45                 :            : }
      46                 :            : 
      47                 :            : void
      48                 :         48 : BM25PlusWeight::init(double factor)
      49                 :            : {
      50                 :         48 :     Xapian::doccount tf = get_termfreq();
      51                 :            : 
      52         [ +  + ]:         48 :     if (rare(tf == 0)) {
      53                 :         24 :         termweight = 0;
      54                 :            :     } else {
      55                 :            :         // BM25+ formula uses IDF = log((total_no_of_docs + 1) / tf)
      56                 :         24 :         termweight = log(double(get_collection_size() + 1) / tf);
      57                 :         24 :         termweight *= factor;
      58         [ +  - ]:         24 :         if (param_k3 != 0) {
      59                 :         24 :             double wqf_double = get_wqf();
      60                 :         24 :             termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
      61                 :            :         }
      62                 :            :     }
      63                 :            : 
      64                 :            :     LOGVALUE(WTCALC, termweight);
      65                 :            : 
      66 [ +  - ][ +  + ]:         48 :     if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
                 [ +  + ]
      67                 :            :         // If param_k2 is 0, and either param_b or param_k1 is 0, then the document
      68                 :            :         // length doesn't affect the weight.
      69                 :         32 :         len_factor = 0;
      70                 :            :     } else {
      71                 :         16 :         len_factor = get_average_length();
      72                 :            :         // len_factor can be zero if all documents are empty (or the database
      73                 :            :         // is empty!)
      74         [ +  - ]:         16 :         if (len_factor != 0) len_factor = 1 / len_factor;
      75                 :            :     }
      76                 :            : 
      77                 :            :     LOGVALUE(WTCALC, len_factor);
      78                 :         48 : }
      79                 :            : 
      80                 :            : string
      81                 :       1454 : BM25PlusWeight::name() const
      82                 :            : {
      83         [ +  - ]:       1454 :     return "Xapian::BM25PlusWeight";
      84                 :            : }
      85                 :            : 
      86                 :            : string
      87                 :       1447 : BM25PlusWeight::short_name() const
      88                 :            : {
      89         [ +  - ]:       1447 :     return "bm25plus";
      90                 :            : }
      91                 :            : 
      92                 :            : string
      93                 :         14 : BM25PlusWeight::serialise() const
      94                 :            : {
      95                 :         14 :     string result = serialise_double(param_k1);
      96 [ +  - ][ +  - ]:         14 :     result += serialise_double(param_k2);
      97 [ +  - ][ +  - ]:         14 :     result += serialise_double(param_k3);
      98 [ +  - ][ +  - ]:         14 :     result += serialise_double(param_b);
      99 [ +  - ][ +  - ]:         14 :     result += serialise_double(param_min_normlen);
     100 [ +  - ][ +  - ]:         14 :     result += serialise_double(param_delta);
     101                 :         14 :     return result;
     102                 :            : }
     103                 :            : 
     104                 :            : BM25PlusWeight *
     105                 :          8 : BM25PlusWeight::unserialise(const string & s) const
     106                 :            : {
     107                 :          8 :     const char *ptr = s.data();
     108                 :          8 :     const char *end = ptr + s.size();
     109         [ +  - ]:          8 :     double k1 = unserialise_double(&ptr, end);
     110         [ +  - ]:          8 :     double k2 = unserialise_double(&ptr, end);
     111         [ +  - ]:          8 :     double k3 = unserialise_double(&ptr, end);
     112         [ +  - ]:          8 :     double b = unserialise_double(&ptr, end);
     113         [ +  - ]:          8 :     double min_normlen = unserialise_double(&ptr, end);
     114         [ +  - ]:          8 :     double delta = unserialise_double(&ptr, end);
     115         [ +  + ]:          8 :     if (rare(ptr != end))
     116 [ +  - ][ +  - ]:          1 :         throw Xapian::SerialisationError("Extra data in BM25PlusWeight::unserialise()");
                 [ +  - ]
     117         [ +  - ]:          7 :     return new BM25PlusWeight(k1, k2, k3, b, min_normlen, delta);
     118                 :            : }
     119                 :            : 
     120                 :            : double
     121                 :        105 : BM25PlusWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     122                 :            :                             Xapian::termcount) const
     123                 :            : {
     124                 :            :     LOGCALL(WTCALC, double, "BM25PlusWeight::get_sumpart", wdf | len);
     125                 :        105 :     Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
     126                 :            : 
     127                 :        105 :     double wdf_double = wdf;
     128                 :        105 :     double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
     129                 :            :     AssertRel(denom,>,0);
     130                 :            :     // Parameter delta (δ) is a pseudo tf value to control the scale of the
     131                 :            :     // tf lower bound. δ can be tuned, e.g. from 0.0 to 1.5, but BM25+ can
     132                 :            :     // still work effectively across collections with a fixed δ = 1.0.
     133                 :        105 :     RETURN(termweight * ((param_k1 + 1) * wdf_double / denom + param_delta));
     134                 :            : }
     135                 :            : 
     136                 :            : double
     137                 :         48 : BM25PlusWeight::get_maxpart() const
     138                 :            : {
     139                 :            :     LOGCALL(WTCALC, double, "BM25PlusWeight::get_maxpart", NO_ARGS);
     140                 :         48 :     double denom = param_k1;
     141         [ +  + ]:         48 :     if (param_k1 != 0.0) {
     142         [ +  + ]:         32 :         if (param_b != 0.0) {
     143                 :            :             // "Upper-bound Approximations for Dynamic Pruning" Craig
     144                 :            :             // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
     145                 :            :             // Information Systems. 29(4), 2011 shows that evaluating at
     146                 :            :             // doclen=wdf_max is a good bound.
     147                 :            :             //
     148                 :            :             // However, we can do better if doclen_min > wdf_max since then a
     149                 :            :             // better bound can be found by simply evaluating at
     150                 :            :             // doclen=doclen_min and wdf=wdf_max.
     151                 :            :             Xapian::doclength normlen_lb =
     152                 :         16 :                  max(max(get_wdf_upper_bound(), get_doclength_lower_bound()) * len_factor, param_min_normlen);
     153                 :         32 :             denom *= (normlen_lb * param_b + (1 - param_b));
     154                 :            :         }
     155                 :            :     }
     156                 :         48 :     double wdf_max = get_wdf_upper_bound();
     157                 :         48 :     denom += wdf_max;
     158                 :            :     AssertRel(denom,>,0);
     159                 :         48 :     RETURN(termweight * ((param_k1 + 1) * wdf_max / denom + param_delta));
     160                 :            : }
     161                 :            : 
     162                 :            : /* The paper which describes BM25+ ignores BM25's document-independent
     163                 :            :  * component (so implicitly k2=0), but we support non-zero k2 too.
     164                 :            :  *
     165                 :            :  * The BM25 formula gives:
     166                 :            :  *
     167                 :            :  * param_k2 * query_length * (1 - normlen) / (1 + normlen)
     168                 :            :  *
     169                 :            :  * To avoid negative sumextra we add the constant (param_k2 * query_length)
     170                 :            :  * to give:
     171                 :            :  *
     172                 :            :  * 2 * param_k2 * query_length / (1 + normlen)
     173                 :            :  */
     174                 :            : double
     175                 :          0 : BM25PlusWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
     176                 :            : {
     177                 :            :     LOGCALL(WTCALC, double, "BM25PlusWeight::get_sumextra", len);
     178                 :          0 :     double num = (2.0 * param_k2 * get_query_length());
     179                 :          0 :     RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
     180                 :            : }
     181                 :            : 
     182                 :            : double
     183                 :         24 : BM25PlusWeight::get_maxextra() const
     184                 :            : {
     185                 :            :     LOGCALL(WTCALC, double, "BM25PlusWeight::get_maxextra", NO_ARGS);
     186         [ +  - ]:         24 :     if (param_k2 == 0.0)
     187                 :         24 :         RETURN(0.0);
     188                 :          0 :     double num = (2.0 * param_k2 * get_query_length());
     189                 :          0 :     RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
     190                 :            :                             param_min_normlen)));
     191                 :            : }
     192                 :            : 
     193                 :            : BM25PlusWeight *
     194                 :          0 : BM25PlusWeight::create_from_parameters(const char * p) const
     195                 :            : {
     196         [ #  # ]:          0 :     if (*p == '\0')
     197         [ #  # ]:          0 :         return new Xapian::BM25PlusWeight();
     198                 :          0 :     double k1 = 1;
     199                 :          0 :     double k2 = 0;
     200                 :          0 :     double k3 = 1;
     201                 :          0 :     double b = 0.5;
     202                 :          0 :     double min_normlen = 0.5;
     203                 :          0 :     double delta = 1.0;
     204         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &k1))
     205 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 1 (k1) is invalid", "bm25plus");
     206 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &k2))
                 [ #  # ]
     207 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 2 (k2) is invalid", "bm25plus");
     208 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &k3))
                 [ #  # ]
     209 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 3 (k3) is invalid", "bm25plus");
     210 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &b))
                 [ #  # ]
     211 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 4 (b) is invalid", "bm25plus");
     212 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &min_normlen))
                 [ #  # ]
     213 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 5 (min_normlen) is invalid", "bm25plus");
     214 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &delta))
                 [ #  # ]
     215 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 6 (delta) is invalid", "bm25plus");
     216         [ #  # ]:          0 :     if (*p)
     217 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Extra data after parameter 6", "bm25plus");
     218         [ #  # ]:          0 :     return new Xapian::BM25PlusWeight(k1, k2, k3, b, min_normlen, delta);
     219                 :            : }
     220                 :            : 
     221                 :            : }

Generated by: LCOV version 1.11