LCOV - code coverage report
Current view: top level - weight - tradweight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core c2b6f1024d3a Lines: 44 55 80.0 %
Date: 2019-05-16 09:13:18 Functions: 9 11 81.8 %
Branches: 18 44 40.9 %

           Branch data     Line data    Source code
       1                 :            : /** @file tradweight.cc
       2                 :            :  * @brief Xapian::TradWeight class - the "traditional" probabilistic formula
       3                 :            :  */
       4                 :            : /* Copyright (C) 2009,2010,2011,2012,2014,2015,2017 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include "xapian/weight.h"
      24                 :            : #include "weightinternal.h"
      25                 :            : 
      26                 :            : #include "debuglog.h"
      27                 :            : #include "omassert.h"
      28                 :            : #include "serialise-double.h"
      29                 :            : 
      30                 :            : #include "xapian/error.h"
      31                 :            : 
      32                 :            : #include <algorithm>
      33                 :            : #include <cmath>
      34                 :            : 
      35                 :            : using namespace std;
      36                 :            : 
      37                 :            : namespace Xapian {
      38                 :            : 
      39                 :            : TradWeight *
      40                 :        124 : TradWeight::clone() const
      41                 :            : {
      42                 :        124 :     return new TradWeight(param_k);
      43                 :            : }
      44                 :            : 
      45                 :            : void
      46                 :         96 : TradWeight::init(double factor)
      47                 :            : {
      48         [ +  + ]:         96 :     if (factor == 0.0) {
      49                 :            :         // This object is for the term-independent contribution, and that's
      50                 :            :         // always zero for this scheme.
      51                 :         40 :         return;
      52                 :            :     }
      53                 :            : 
      54                 :         56 :     Xapian::doccount tf = get_termfreq();
      55                 :            : 
      56                 :         56 :     double tw = 0;
      57         [ +  + ]:         56 :     if (get_rset_size() != 0) {
      58                 :         16 :         Xapian::doccount reltermfreq = get_reltermfreq();
      59                 :            : 
      60                 :            :         // There can't be more relevant documents indexed by a term than there
      61                 :            :         // are documents indexed by that term.
      62                 :            :         AssertRel(reltermfreq,<=,tf);
      63                 :            : 
      64                 :            :         // There can't be more relevant documents indexed by a term than there
      65                 :            :         // are relevant documents.
      66                 :            :         AssertRel(reltermfreq,<=,get_rset_size());
      67                 :            : 
      68                 :         16 :         Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
      69                 :            : 
      70                 :            :         // There can't be more relevant documents not indexed by a term than
      71                 :            :         // there are documents not indexed by that term.
      72                 :            :         AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
      73                 :            : 
      74                 :         16 :         Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
      75                 :            : 
      76                 :         16 :         Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
      77                 :         16 :         double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
      78                 :         16 :         double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
      79                 :         16 :         tw = numerator / denom;
      80                 :            :     } else {
      81                 :         40 :         tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
      82                 :            :     }
      83                 :            : 
      84                 :            :     AssertRel(tw,>,0);
      85                 :            : 
      86                 :            :     // The "official" formula can give a negative termweight in unusual cases
      87                 :            :     // (without an RSet, when a term indexes more than half the documents in
      88                 :            :     // the database).  These negative weights aren't actually helpful, and it
      89                 :            :     // is common for implementations to replace them with a small positive
      90                 :            :     // weight or similar.
      91                 :            :     //
      92                 :            :     // Truncating to zero doesn't seem a great approach in practice as it
      93                 :            :     // means that some terms in the query can have no effect at all on the
      94                 :            :     // ranking, and that some results can have zero weight, both of which
      95                 :            :     // seem surprising.
      96                 :            :     //
      97                 :            :     // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
      98                 :            :     // more than a third of documents, which seems rather "intrusive".  That's
      99                 :            :     // what the code currently enabled does, but perhaps it would be better to
     100                 :            :     // do something else. (FIXME)
     101                 :            : #if 0
     102                 :            :     if (rare(tw <= 1.0)) {
     103                 :            :         termweight = 0;
     104                 :            :     } else {
     105                 :            :         termweight = log(tw) * factor;
     106                 :            :     }
     107                 :            : #else
     108         [ +  + ]:         56 :     if (tw < 2) tw = tw * 0.5 + 1;
     109                 :         56 :     termweight = log(tw) * factor;
     110                 :            : #endif
     111                 :            : 
     112                 :            :     LOGVALUE(WTCALC, termweight);
     113                 :            : 
     114         [ +  + ]:         56 :     if (param_k == 0) {
     115                 :            :         // If param_k is 0 then the document length doesn't affect the weight.
     116                 :         16 :         len_factor = 0;
     117                 :            :     } else {
     118                 :         40 :         len_factor = get_average_length();
     119                 :            :         // len_factor can be zero if all documents are empty (or the database is
     120                 :            :         // empty!)
     121         [ +  - ]:         40 :         if (len_factor != 0) len_factor = param_k / len_factor;
     122                 :            :     }
     123                 :            : 
     124                 :            :     LOGVALUE(WTCALC, len_factor);
     125                 :            : }
     126                 :            : 
     127                 :            : string
     128                 :       1458 : TradWeight::name() const
     129                 :            : {
     130         [ +  - ]:       1458 :     return "Xapian::TradWeight";
     131                 :            : }
     132                 :            : 
     133                 :            : string
     134                 :       1447 : TradWeight::short_name() const
     135                 :            : {
     136         [ +  - ]:       1447 :     return "trad";
     137                 :            : }
     138                 :            : 
     139                 :            : string
     140                 :         18 : TradWeight::serialise() const
     141                 :            : {
     142                 :         18 :     return serialise_double(param_k);
     143                 :            : }
     144                 :            : 
     145                 :            : TradWeight *
     146                 :         12 : TradWeight::unserialise(const string & s) const
     147                 :            : {
     148                 :         12 :     const char *ptr = s.data();
     149                 :         12 :     const char *end = ptr + s.size();
     150         [ +  - ]:         12 :     double k = unserialise_double(&ptr, end);
     151         [ +  + ]:         12 :     if (rare(ptr != end))
     152 [ +  - ][ +  - ]:          1 :         throw Xapian::SerialisationError("Extra data in TradWeight::unserialise()");
                 [ +  - ]
     153         [ +  - ]:         11 :     return new TradWeight(k);
     154                 :            : }
     155                 :            : 
     156                 :            : double
     157                 :       2961 : TradWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     158                 :            :                         Xapian::termcount) const
     159                 :            : {
     160                 :       2961 :     double wdf_double = wdf;
     161                 :       2961 :     return termweight * (wdf_double / (len * len_factor + wdf_double));
     162                 :            : }
     163                 :            : 
     164                 :            : double
     165                 :        113 : TradWeight::get_maxpart() const
     166                 :            : {
     167                 :            :     // FIXME: need to force non-zero wdf_max to stop percentages breaking...
     168                 :        113 :     double wdf_max = max(get_wdf_upper_bound(), Xapian::termcount(1));
     169                 :        113 :     Xapian::termcount doclen_lb = get_doclength_lower_bound();
     170                 :        113 :     return termweight * (wdf_max / (doclen_lb * len_factor + wdf_max));
     171                 :            : }
     172                 :            : 
     173                 :            : double
     174                 :          0 : TradWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
     175                 :            : {
     176                 :          0 :     return 0;
     177                 :            : }
     178                 :            : 
     179                 :            : double
     180                 :         40 : TradWeight::get_maxextra() const
     181                 :            : {
     182                 :         40 :     return 0;
     183                 :            : }
     184                 :            : 
     185                 :            : TradWeight *
     186                 :          0 : TradWeight::create_from_parameters(const char * p) const
     187                 :            : {
     188         [ #  # ]:          0 :     if (*p == '\0')
     189         [ #  # ]:          0 :         return new Xapian::TradWeight();
     190                 :          0 :     double k = 1.0;
     191         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &k))
     192 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter is invalid", "trad");
     193         [ #  # ]:          0 :     if (*p)
     194 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Extra data after parameter", "trad");
     195         [ #  # ]:          0 :     return new Xapian::TradWeight(k);
     196                 :            : }
     197                 :            : 
     198                 :            : }

Generated by: LCOV version 1.11