LCOV - code coverage report
Current view: top level - weight - pl2weight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 57 70 81.4 %
Date: 2019-06-30 05:20:33 Functions: 10 12 83.3 %
Branches: 24 62 38.7 %

           Branch data     Line data    Source code
       1                 :            : /** @file pl2weight.cc
       2                 :            :  * @brief Xapian::PL2Weight class - the PL2 weighting scheme of the DFR framework.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2013 Aarsh Shah
       5                 :            :  * Copyright (C) 2013,2014,2016 Olly Betts
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or
       8                 :            :  * modify it under the terms of the GNU General Public License as
       9                 :            :  * published by the Free Software Foundation; either version 2 of the
      10                 :            :  * License, or (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "xapian/weight.h"
      25                 :            : #include "common/log2.h"
      26                 :            : #include "weightinternal.h"
      27                 :            : 
      28                 :            : #include "serialise-double.h"
      29                 :            : 
      30                 :            : #include "xapian/error.h"
      31                 :            : 
      32                 :            : #include <algorithm>
      33                 :            : 
      34                 :            : using namespace std;
      35                 :            : 
      36                 :            : namespace Xapian {
      37                 :            : 
      38                 :        141 : PL2Weight::PL2Weight(double c) : param_c(c)
      39                 :            : {
      40         [ +  + ]:         70 :     if (param_c <= 0)
      41 [ +  - ][ +  - ]:          1 :         throw Xapian::InvalidArgumentError("Parameter c is invalid.");
                 [ +  - ]
      42                 :         69 :     need_stat(AVERAGE_LENGTH);
      43                 :         69 :     need_stat(DOC_LENGTH);
      44                 :         69 :     need_stat(DOC_LENGTH_MIN);
      45                 :         69 :     need_stat(DOC_LENGTH_MAX);
      46                 :         69 :     need_stat(COLLECTION_SIZE);
      47                 :         69 :     need_stat(COLLECTION_FREQ);
      48                 :         69 :     need_stat(WDF);
      49                 :         69 :     need_stat(WDF_MAX);
      50                 :         69 :     need_stat(WQF);
      51                 :         69 : }
      52                 :            : 
      53                 :            : PL2Weight *
      54                 :         46 : PL2Weight::clone() const
      55                 :            : {
      56         [ +  - ]:         46 :     return new PL2Weight(param_c);
      57                 :            : }
      58                 :            : 
      59                 :            : void
      60                 :         32 : PL2Weight::init(double factor_)
      61                 :            : {
      62         [ +  + ]:         32 :     if (factor_ == 0.0) {
      63                 :            :         // This object is for the term-independent contribution, and that's
      64                 :            :         // always zero for this scheme.
      65                 :         16 :         return;
      66                 :            :     }
      67                 :            : 
      68                 :         16 :     factor = factor_;
      69                 :            : 
      70         [ -  + ]:         16 :     if (get_wdf_upper_bound() == 0) {
      71                 :            :         // The "extra" weight object is cloned, init() called and then
      72                 :            :         // get_maxextra() is called and we discover that we don't need it.
      73                 :            :         // So we need to handle that case (which will give us 0 from
      74                 :            :         // get_wdf_upper_bound() here).
      75                 :          0 :         upper_bound = 0;
      76                 :          0 :         return;
      77                 :            :     }
      78                 :            : 
      79                 :         16 :     factor *= get_wqf();
      80                 :            : 
      81                 :         16 :     cl = param_c * get_average_length();
      82                 :            : 
      83                 :         16 :     double base_change(1.0 / log(2.0));
      84                 :         16 :     double mean = double(get_collection_freq()) / get_collection_size();
      85                 :         16 :     P1 = mean * base_change + 0.5 * log2(2.0 * M_PI);
      86                 :         16 :     P2 = log2(mean) + base_change;
      87                 :            : 
      88                 :         16 :     double wdfn_lower = log2(1 + cl / get_doclength_upper_bound());
      89         [ +  - ]:         16 :     double divisior = max(get_wdf_upper_bound(), get_doclength_lower_bound());
      90                 :         16 :     double wdfn_upper = get_wdf_upper_bound() * log2(1 + cl / divisior);
      91                 :            : 
      92                 :            :     // Calculate an upper bound on the weights which get_sumpart() can return.
      93                 :            :     //
      94                 :            :     // We consider the equation for P as the sum of two parts which we
      95                 :            :     // maximise individually:
      96                 :            :     //
      97                 :            :     // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
      98                 :            :     // (b) (P1 - P2 * wdfn) / (wdfn + 1)
      99                 :            :     //
     100                 :            :     // To maximise (a), the fractional part is always positive (since wdfn>0)
     101                 :            :     // and is maximised by maximising wdfn - clearer when rewritten as:
     102                 :            :     // (1 - 0.5 / (wdfn + 1))
     103                 :            :     //
     104                 :            :     // The log part of (a) is clearly also maximised by maximising wdfn,
     105                 :            :     // so we want to evaluate (a) at wdfn=wdfn_upper.
     106                 :         16 :     double P_max2a = (wdfn_upper + 0.5) * log2(wdfn_upper) / (wdfn_upper + 1.0);
     107                 :            :     // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
     108                 :            :     //
     109                 :            :     // (P1 + P2)/x - P2
     110                 :            :     //
     111                 :            :     // Differentiating wrt x gives:
     112                 :            :     //
     113                 :            :     // -(P1 + P2)/x²
     114                 :            :     //
     115                 :            :     // So there are no local minima or maxima, and the function is continuous
     116                 :            :     // in the range of interest, so the sign of this differential tells us
     117                 :            :     // whether we want to maximise or minimise wdfn, and since x>1, we can
     118                 :            :     // just consider the sign of: (P1 + P2)
     119                 :            :     //
     120                 :            :     // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
     121                 :            :     // giving us a bound that can't be bettered if wdfn_upper is tight.
     122         [ +  - ]:         16 :     double wdfn_optb = P1 + P2 > 0 ? wdfn_upper : wdfn_lower;
     123                 :         16 :     double P_max2b = (P1 - P2 * wdfn_optb) / (wdfn_optb + 1.0);
     124                 :         16 :     upper_bound = factor * (P_max2a + P_max2b);
     125                 :            : 
     126         [ -  + ]:         32 :     if (rare(upper_bound <= 0)) upper_bound = 0;
     127                 :            : }
     128                 :            : 
     129                 :            : string
     130                 :       1564 : PL2Weight::name() const
     131                 :            : {
     132         [ +  - ]:       1564 :     return "Xapian::PL2Weight";
     133                 :            : }
     134                 :            : 
     135                 :            : string
     136                 :       1559 : PL2Weight::short_name() const
     137                 :            : {
     138         [ +  - ]:       1559 :     return "pl2";
     139                 :            : }
     140                 :            : 
     141                 :            : string
     142                 :         14 : PL2Weight::serialise() const
     143                 :            : {
     144                 :         14 :     return serialise_double(param_c);
     145                 :            : }
     146                 :            : 
     147                 :            : PL2Weight *
     148                 :          6 : PL2Weight::unserialise(const string & s) const
     149                 :            : {
     150                 :          6 :     const char *ptr = s.data();
     151                 :          6 :     const char *end = ptr + s.size();
     152         [ +  - ]:          6 :     double c = unserialise_double(&ptr, end);
     153         [ +  + ]:          6 :     if (rare(ptr != end))
     154 [ +  - ][ +  - ]:          1 :         throw Xapian::SerialisationError("Extra data in PL2Weight::unserialise()");
                 [ +  - ]
     155 [ +  - ][ +  - ]:          5 :     return new PL2Weight(c);
     156                 :            : }
     157                 :            : 
     158                 :            : double
     159                 :         70 : PL2Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     160                 :            :                        Xapian::termcount) const
     161                 :            : {
     162         [ -  + ]:         70 :     if (wdf == 0) return 0.0;
     163                 :            : 
     164                 :         70 :     double wdfn = wdf * log2(1 + cl / len);
     165                 :            : 
     166                 :         70 :     double P = P1 + (wdfn + 0.5) * log2(wdfn) - P2 * wdfn;
     167         [ -  + ]:         70 :     if (rare(P <= 0)) return 0.0;
     168                 :            : 
     169                 :         70 :     return factor * P / (wdfn + 1.0);
     170                 :            : }
     171                 :            : 
     172                 :            : double
     173                 :         32 : PL2Weight::get_maxpart() const
     174                 :            : {
     175                 :         32 :     return upper_bound;
     176                 :            : }
     177                 :            : 
     178                 :            : double
     179                 :          0 : PL2Weight::get_sumextra(Xapian::termcount, Xapian::termcount) const
     180                 :            : {
     181                 :          0 :     return 0;
     182                 :            : }
     183                 :            : 
     184                 :            : double
     185                 :         16 : PL2Weight::get_maxextra() const
     186                 :            : {
     187                 :         16 :     return 0;
     188                 :            : }
     189                 :            : 
     190                 :            : PL2Weight *
     191                 :          0 : PL2Weight::create_from_parameters(const char * p) const
     192                 :            : {
     193         [ #  # ]:          0 :     if (*p == '\0')
     194         [ #  # ]:          0 :         return new Xapian::PL2Weight();
     195                 :          0 :     double k = 1.0;
     196         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &k))
     197 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter is invalid", "pl2");
     198         [ #  # ]:          0 :     if (*p)
     199 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Extra data after parameter", "pl2");
     200 [ #  # ][ #  # ]:          0 :     return new Xapian::PL2Weight(k);
     201                 :            : }
     202                 :            : 
     203                 :            : }

Generated by: LCOV version 1.11