LCOV - code coverage report
Current view: top level - weight - bm25weight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 69 90 76.7 %
Date: 2019-05-23 11:15:29 Functions: 10 11 90.9 %
Branches: 41 118 34.7 %

           Branch data     Line data    Source code
       1                 :            : /** @file bm25weight.cc
       2                 :            :  * @brief Xapian::BM25Weight class - the BM25 probabilistic formula
       3                 :            :  */
       4                 :            : /* Copyright (C) 2009,2010,2011,2012,2014,2015 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include "xapian/weight.h"
      24                 :            : #include "weightinternal.h"
      25                 :            : 
      26                 :            : #include "debuglog.h"
      27                 :            : #include "omassert.h"
      28                 :            : #include "serialise-double.h"
      29                 :            : 
      30                 :            : #include "xapian/error.h"
      31                 :            : 
      32                 :            : #include <algorithm>
      33                 :            : #include <cmath>
      34                 :            : 
      35                 :            : using namespace std;
      36                 :            : 
      37                 :            : namespace Xapian {
      38                 :            : 
      39                 :            : BM25Weight *
      40                 :     484192 : BM25Weight::clone() const
      41                 :            : {
      42                 :            :     return new BM25Weight(param_k1, param_k2, param_k3, param_b,
      43                 :     484192 :                           param_min_normlen);
      44                 :            : }
      45                 :            : 
      46                 :            : void
      47                 :     484154 : BM25Weight::init(double factor)
      48                 :            : {
      49                 :     484154 :     Xapian::doccount tf = get_termfreq();
      50                 :            : 
      51                 :     484154 :     double tw = 0;
      52         [ +  + ]:     484154 :     if (get_rset_size() != 0) {
      53                 :        186 :         Xapian::doccount reltermfreq = get_reltermfreq();
      54                 :            : 
      55                 :            :         // There can't be more relevant documents indexed by a term than there
      56                 :            :         // are documents indexed by that term.
      57                 :            :         AssertRel(reltermfreq,<=,tf);
      58                 :            : 
      59                 :            :         // There can't be more relevant documents indexed by a term than there
      60                 :            :         // are relevant documents.
      61                 :            :         AssertRel(reltermfreq,<=,get_rset_size());
      62                 :            : 
      63                 :        186 :         Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
      64                 :            : 
      65                 :            :         // There can't be more relevant documents not indexed by a term than
      66                 :            :         // there are documents not indexed by that term.
      67                 :            :         AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
      68                 :            : 
      69                 :        186 :         Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
      70                 :            : 
      71                 :        186 :         Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
      72                 :        186 :         double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
      73                 :        186 :         double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
      74                 :        186 :         tw = numerator / denom;
      75                 :            :     } else {
      76                 :     483968 :         tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
      77                 :            :     }
      78                 :            : 
      79                 :            :     AssertRel(tw,>,0);
      80                 :            : 
      81                 :            :     // The "official" formula can give a negative termweight in unusual cases
      82                 :            :     // (without an RSet, when a term indexes more than half the documents in
      83                 :            :     // the database).  These negative weights aren't actually helpful, and it
      84                 :            :     // is common for implementations to replace them with a small positive
      85                 :            :     // weight or similar.
      86                 :            :     //
      87                 :            :     // Truncating to zero doesn't seem a great approach in practice as it
      88                 :            :     // means that some terms in the query can have no effect at all on the
      89                 :            :     // ranking, and that some results can have zero weight, both of which
      90                 :            :     // seem surprising.
      91                 :            :     //
      92                 :            :     // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
      93                 :            :     // more than a third of documents, which seems rather "intrusive".  That's
      94                 :            :     // what the code currently enabled does, but perhaps it would be better to
      95                 :            :     // do something else. (FIXME)
      96                 :            : #if 0
      97                 :            :     if (rare(tw <= 1.0)) {
      98                 :            :         termweight = 0;
      99                 :            :     } else {
     100                 :            :         termweight = log(tw) * factor;
     101                 :            :         if (param_k3 != 0) {
     102                 :            :             double wqf_double = get_wqf();
     103                 :            :             termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
     104                 :            :         }
     105                 :            :     }
     106                 :            : #else
     107         [ +  + ]:     484154 :     if (tw < 2) tw = tw * 0.5 + 1;
     108                 :     484154 :     termweight = log(tw) * factor;
     109         [ +  + ]:     484154 :     if (param_k3 != 0) {
     110                 :     484138 :         double wqf_double = get_wqf();
     111                 :     484138 :         termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
     112                 :            :     }
     113                 :            : #endif
     114                 :     484154 :     termweight *= (param_k1 + 1);
     115                 :            : 
     116                 :            :     LOGVALUE(WTCALC, termweight);
     117                 :            : 
     118 [ +  + ][ +  + ]:     484154 :     if (param_k2 == 0 && (param_b == 0 || param_k1 == 0)) {
                 [ +  + ]
     119                 :            :         // If k2 is 0, and either param_b or param_k1 is 0 then the document
     120                 :            :         // length doesn't affect the weight.
     121                 :         48 :         len_factor = 0;
     122                 :            :     } else {
     123                 :     484106 :         len_factor = get_average_length();
     124                 :            :         // len_factor can be zero if all documents are empty (or the database
     125                 :            :         // is empty!)
     126         [ +  + ]:     484106 :         if (len_factor != 0) len_factor = 1 / len_factor;
     127                 :            :     }
     128                 :            : 
     129                 :            :     LOGVALUE(WTCALC, len_factor);
     130                 :     484154 : }
     131                 :            : 
     132                 :            : string
     133                 :      11639 : BM25Weight::name() const
     134                 :            : {
     135         [ +  - ]:      11639 :     return "Xapian::BM25Weight";
     136                 :            : }
     137                 :            : 
     138                 :            : string
     139                 :       1450 : BM25Weight::short_name() const
     140                 :            : {
     141         [ +  - ]:       1450 :     return "bm25";
     142                 :            : }
     143                 :            : 
     144                 :            : string
     145                 :      10196 : BM25Weight::serialise() const
     146                 :            : {
     147                 :      10196 :     string result = serialise_double(param_k1);
     148 [ +  - ][ +  - ]:      10196 :     result += serialise_double(param_k2);
     149 [ +  - ][ +  - ]:      10196 :     result += serialise_double(param_k3);
     150 [ +  - ][ +  - ]:      10196 :     result += serialise_double(param_b);
     151 [ +  - ][ +  - ]:      10196 :     result += serialise_double(param_min_normlen);
     152                 :      10196 :     return result;
     153                 :            : }
     154                 :            : 
     155                 :            : BM25Weight *
     156                 :      10188 : BM25Weight::unserialise(const string & s) const
     157                 :            : {
     158                 :      10188 :     const char *ptr = s.data();
     159                 :      10188 :     const char *end = ptr + s.size();
     160         [ +  - ]:      10188 :     double k1 = unserialise_double(&ptr, end);
     161         [ +  - ]:      10188 :     double k2 = unserialise_double(&ptr, end);
     162         [ +  - ]:      10188 :     double k3 = unserialise_double(&ptr, end);
     163         [ +  - ]:      10188 :     double b = unserialise_double(&ptr, end);
     164         [ +  - ]:      10188 :     double min_normlen = unserialise_double(&ptr, end);
     165         [ +  + ]:      10188 :     if (rare(ptr != end))
     166 [ +  - ][ +  - ]:          1 :         throw Xapian::SerialisationError("Extra data in BM25Weight::unserialise()");
                 [ +  - ]
     167         [ +  - ]:      10187 :     return new BM25Weight(k1, k2, k3, b, min_normlen);
     168                 :            : }
     169                 :            : 
     170                 :            : double
     171                 :   47685349 : BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     172                 :            :                         Xapian::termcount) const
     173                 :            : {
     174                 :            :     LOGCALL(WTCALC, double, "BM25Weight::get_sumpart", wdf | len);
     175                 :   47685349 :     Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
     176                 :            : 
     177                 :   47685349 :     double wdf_double = wdf;
     178                 :   47685349 :     double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
     179                 :            :     AssertRel(denom,>,0);
     180                 :   47685349 :     RETURN(termweight * (wdf_double / denom));
     181                 :            : }
     182                 :            : 
     183                 :            : double
     184                 :     819204 : BM25Weight::get_maxpart() const
     185                 :            : {
     186                 :            :     LOGCALL(WTCALC, double, "BM25Weight::get_maxpart", NO_ARGS);
     187                 :     819204 :     double denom = param_k1;
     188         [ +  + ]:     819204 :     if (param_k1 != 0.0) {
     189         [ +  + ]:     819155 :         if (param_b != 0.0) {
     190                 :            :             // "Upper-bound Approximations for Dynamic Pruning" Craig
     191                 :            :             // Macdonald, Nicola Tonellotto and Iadh Ounis. ACM Transactions on
     192                 :            :             // Information Systems. 29(4), 2011 shows that evaluating at
     193                 :            :             // doclen=wdf_max is a good bound.
     194                 :            :             //
     195                 :            :             // However, we can do better if doclen_min > wdf_max since then a
     196                 :            :             // better bound can be found by simply evaluating at
     197                 :            :             // doclen=doclen_min and wdf=wdf_max.
     198                 :            :             Xapian::doclength normlen_lb =
     199                 :     819139 :                  max(max(get_wdf_upper_bound(), get_doclength_lower_bound()) * len_factor, param_min_normlen);
     200                 :     819155 :             denom *= (normlen_lb * param_b + (1 - param_b));
     201                 :            :         }
     202                 :            :     }
     203                 :     819204 :     double wdf_max = get_wdf_upper_bound();
     204                 :     819204 :     denom += wdf_max;
     205                 :            :     AssertRel(denom,>,0);
     206                 :     819204 :     RETURN(termweight * (wdf_max / denom));
     207                 :            : }
     208                 :            : 
     209                 :            : /* The BM25 formula gives:
     210                 :            :  *
     211                 :            :  * param_k2 * query_length * (1 - normlen) / (1 + normlen)
     212                 :            :  *
     213                 :            :  * To avoid negative sumextra we add the constant (param_k2 * query_length)
     214                 :            :  * to give:
     215                 :            :  *
     216                 :            :  * 2 * param_k2 * query_length / (1 + normlen)
     217                 :            :  */
     218                 :            : double
     219                 :         49 : BM25Weight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
     220                 :            : {
     221                 :            :     LOGCALL(WTCALC, double, "BM25Weight::get_sumextra", len);
     222                 :         49 :     double num = (2.0 * param_k2 * get_query_length());
     223                 :         49 :     RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
     224                 :            : }
     225                 :            : 
     226                 :            : double
     227                 :     175848 : BM25Weight::get_maxextra() const
     228                 :            : {
     229                 :            :     LOGCALL(WTCALC, double, "BM25Weight::get_maxextra", NO_ARGS);
     230         [ +  + ]:     175848 :     if (param_k2 == 0.0)
     231                 :     175816 :         RETURN(0.0);
     232                 :         32 :     double num = (2.0 * param_k2 * get_query_length());
     233                 :         32 :     RETURN(num / (1.0 + max(get_doclength_lower_bound() * len_factor,
     234                 :            :                             param_min_normlen)));
     235                 :            : }
     236                 :            : 
     237                 :            : BM25Weight *
     238                 :          0 : BM25Weight::create_from_parameters(const char * p) const
     239                 :            : {
     240         [ #  # ]:          0 :     if (*p == '\0')
     241         [ #  # ]:          0 :         return new Xapian::BM25Weight();
     242                 :          0 :     double k1 = 1;
     243                 :          0 :     double k2 = 0;
     244                 :          0 :     double k3 = 1;
     245                 :          0 :     double b = 0.5;
     246                 :          0 :     double min_normlen = 0.5;
     247         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &k1))
     248 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 1 (k1) is invalid", "bm25");
     249 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &k2))
                 [ #  # ]
     250 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 2 (k2) is invalid", "bm25");
     251 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &k3))
                 [ #  # ]
     252 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 3 (k3) is invalid", "bm25");
     253 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &b))
                 [ #  # ]
     254 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 4 (b) is invalid", "bm25");
     255 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &min_normlen))
                 [ #  # ]
     256 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 5 (min_normlen) is invalid", "bm25");
     257         [ #  # ]:          0 :     if (*p)
     258 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Extra data after parameter 5", "bm25");
     259         [ #  # ]:          0 :     return new Xapian::BM25Weight(k1, k2, k3, b, min_normlen);
     260                 :            : }
     261                 :            : 
     262                 :            : }

Generated by: LCOV version 1.11