LCOV - code coverage report
Current view: top level - weight - pl2plusweight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core eba1a2e3082b Lines: 66 82 80.5 %
Date: 2019-06-13 13:35:36 Functions: 10 12 83.3 %
Branches: 33 84 39.3 %

           Branch data     Line data    Source code
       1                 :            : /** @file pl2plusweight.cc
       2                 :            :  * @brief Xapian::PL2PlusWeight class - the PL2+ weighting scheme of the DFR framework.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2013 Aarsh Shah
       5                 :            :  * Copyright (C) 2013,2014,2016,2017 Olly Betts
       6                 :            :  * Copyright (C) 2016 Vivek Pal
       7                 :            :  *
       8                 :            :  * This program is free software; you can redistribute it and/or
       9                 :            :  * modify it under the terms of the GNU General Public License as
      10                 :            :  * published by the Free Software Foundation; either version 2 of the
      11                 :            :  * License, or (at your option) any later version.
      12                 :            :  *
      13                 :            :  * This program is distributed in the hope that it will be useful
      14                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      16                 :            :  * GNU General Public License for more details.
      17                 :            :  *
      18                 :            :  * You should have received a copy of the GNU General Public License
      19                 :            :  * along with this program; if not, write to the Free Software
      20                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      21                 :            :  */
      22                 :            : 
      23                 :            : #include <config.h>
      24                 :            : 
      25                 :            : #include "xapian/weight.h"
      26                 :            : #include "common/log2.h"
      27                 :            : #include "weightinternal.h"
      28                 :            : 
      29                 :            : #include "serialise-double.h"
      30                 :            : 
      31                 :            : #include "xapian/error.h"
      32                 :            : 
      33                 :            : #include <algorithm>
      34                 :            : 
      35                 :            : using namespace std;
      36                 :            : 
      37                 :            : namespace Xapian {
      38                 :            : 
      39                 :        104 : PL2PlusWeight::PL2PlusWeight(double c, double delta) // Construct a PL2+ weight; throws InvalidArgumentError unless c > 0 and delta > 0.
      40                 :        106 :         : param_c(c), param_delta(delta)
      41                 :            : {
      42         [ +  + ]:        104 :     if (param_c <= 0) // zero or negative c is rejected
      43 [ +  - ][ +  - ]:          1 :         throw Xapian::InvalidArgumentError("Parameter c is invalid.");
                 [ +  - ]
      44         [ +  + ]:        103 :     if (param_delta <= 0) // zero or negative delta is rejected (init() takes log2(param_delta))
      45 [ +  - ][ +  - ]:          1 :         throw Xapian::InvalidArgumentError("Parameter delta is invalid.");
                 [ +  - ]
      46                 :        102 :     need_stat(AVERAGE_LENGTH); // presumably registers each statistic so the matching get_*() accessor is usable later - TODO confirm against need_stat()
      47                 :        102 :     need_stat(DOC_LENGTH);
      48                 :        102 :     need_stat(DOC_LENGTH_MIN);
      49                 :        102 :     need_stat(DOC_LENGTH_MAX);
      50                 :        102 :     need_stat(COLLECTION_SIZE);
      51                 :        102 :     need_stat(COLLECTION_FREQ);
      52                 :        102 :     need_stat(WDF);
      53                 :        102 :     need_stat(WDF_MAX);
      54                 :        102 :     need_stat(WQF);
      55                 :        102 : }
      56                 :            : 
      57                 :            : PL2PlusWeight * // Returns a newly allocated PL2PlusWeight built from this object's c and delta.
      58                 :         69 : PL2PlusWeight::clone() const
      59                 :            : {
      60         [ +  - ]:         69 :     return new PL2PlusWeight(param_c, param_delta); // only the two parameters are copied; values computed by init() are not
      61                 :            : }
      62                 :            : 
      63                 :            : void // Precomputes factor, cl, mean, P1, P2, dw and upper_bound for later get_sumpart()/get_maxpart() calls.
      64                 :         48 : PL2PlusWeight::init(double factor_) // factor_ scales every weight; 0.0 selects the term-independent object.
      65                 :            : {
      66         [ +  + ]:         48 :     if (factor_ == 0.0) {
      67                 :            :         // This object is for the term-independent contribution, and that's
      68                 :            :         // always zero for this scheme.
      69                 :         24 :         return; // note: none of the members assigned below are set on this path
      70                 :            :     }
      71                 :            : 
      72                 :         24 :     factor = factor_;
      73                 :            : 
      74         [ -  + ]:         24 :     if (get_wdf_upper_bound() == 0) {
      75                 :            :         // The "extra" weight object is cloned, init() called and then
      76                 :            :         // get_maxextra() is called and we discover that we don't need it.
      77                 :            :         // So we need to handle that case (which will give us 0 from
      78                 :            :         // get_wdf_upper_bound() here).
      79                 :          0 :         upper_bound = 0;
      80                 :          0 :         return;
      81                 :            :     }
      82                 :            : 
      83                 :         24 :     factor *= get_wqf(); // fold the within-query frequency into the scale factor
      84                 :            : 
      85                 :         24 :     cl = param_c * get_average_length(); // numerator of the length normalisation used for wdfn
      86                 :            : 
      87                 :         24 :     double base_change(1.0 / log(2.0)); // 1/ln(2), i.e. log2(e)
      88                 :         24 :     mean = double(get_collection_freq()) / get_collection_size(); // collection frequency divided by collection size
      89                 :         24 :     P1 = mean * base_change + 0.5 * log2(2.0 * M_PI); // wdfn-independent part of P
      90                 :         24 :     P2 = log2(mean) + base_change; // coefficient of wdfn in P
      91                 :            : 
      92                 :         24 :     double wdfn_lower = log2(1 + cl / get_doclength_upper_bound());
      93         [ +  - ]:         24 :     double divisior = max(get_wdf_upper_bound(), get_doclength_lower_bound()); // "divisior" (sic) = divisor; presumably a doc can't be shorter than its wdf - TODO confirm
      94                 :         24 :     double wdfn_upper = get_wdf_upper_bound() * log2(1 + cl / divisior);
      95                 :            : 
      96                 :         24 :     double P_delta = P1 + (param_delta + 0.5) * log2(param_delta) - P2 * param_delta; // same expression as P in get_sumpart() with wdfn replaced by param_delta
      97                 :         24 :     dw = P_delta / (param_delta + 1.0); // constant term that get_sumpart() adds to every weight
      98                 :            : 
      99                 :            :     // Calculate an upper bound on the weights which get_sumpart() can return.
     100                 :            :     //
     101                 :            :     // We consider the equation for P as the sum of two parts which we
     102                 :            :     // maximise individually:
     103                 :            :     //
     104                 :            :     // (a) (wdfn + 0.5) / (wdfn + 1) * log2(wdfn)
     105                 :            :     // (b) (P1 - P2 * wdfn) / (wdfn + 1)
     106                 :            :     //
     107                 :            :     // To maximise (a), the fractional part is always positive (since wdfn>0)
     108                 :            :     // and is maximised by maximising wdfn - clearer when rewritten as:
     109                 :            :     // (1 - 0.5 / (wdfn + 1))
     110                 :            :     //
     111                 :            :     // The log part of (a) is clearly also maximised by maximising wdfn,
     112                 :            :     // so we want to evaluate (a) at wdfn=wdfn_upper.
     113                 :         24 :     double P_max2a = (wdfn_upper + 0.5) * log2(wdfn_upper) / (wdfn_upper + 1.0);
     114                 :            :     // To maximise (b) substitute x=wdfn+1 (so x>1) and we get:
     115                 :            :     //
     116                 :            :     // (P1 + P2)/x - P2
     117                 :            :     //
     118                 :            :     // Differentiating wrt x gives:
     119                 :            :     //
     120                 :            :     // -(P1 + P2)/x²
     121                 :            :     //
     122                 :            :     // So there are no local minima or maxima, and the function is continuous
     123                 :            :     // in the range of interest, so the sign of this differential tells us
     124                 :            :     // whether we want to maximise or minimise wdfn, and since x>1, we can
     125                 :            :     // just consider the sign of: (P1 + P2)
     126                 :            :     //
     127                 :            :     // Commonly P1 + P2 > 0, in which case we evaluate P at wdfn=wdfn_upper
     128                 :            :     // giving us a bound that can't be bettered if wdfn_upper is tight.
     129         [ +  - ]:         24 :     double wdfn_optb = P1 + P2 > 0 ? wdfn_upper : wdfn_lower;
     130                 :         24 :     double P_max2b = (P1 - P2 * wdfn_optb) / (wdfn_optb + 1.0);
     131                 :         24 :     upper_bound = factor * (P_max2a + P_max2b + dw); // sum of the two separately maximised parts plus the constant dw, scaled by factor
     132                 :            : 
     133         [ -  + ]:         48 :     if (rare(upper_bound <= 0)) upper_bound = 0; // never report a negative bound
     134                 :            : }
     135                 :            : 
     136                 :            : string // Returns the fixed identifier "Xapian::PL2PlusWeight" for this scheme.
     137                 :       1454 : PL2PlusWeight::name() const
     138                 :            : {
     139         [ +  - ]:       1454 :     return "Xapian::PL2PlusWeight";
     140                 :            : }
     141                 :            : 
     142                 :            : string // Returns the short identifier "pl2plus", the same name create_from_parameters() uses in its error reports.
     143                 :       1447 : PL2PlusWeight::short_name() const
     144                 :            : {
     145         [ +  - ]:       1447 :     return "pl2plus";
     146                 :            : }
     147                 :            : 
     148                 :            : string // Returns serialise_double(param_c) followed by serialise_double(param_delta).
     149                 :         18 : PL2PlusWeight::serialise() const
     150                 :            : {
     151                 :         18 :     string result = serialise_double(param_c); // c first ...
     152 [ +  - ][ +  - ]:         18 :     result += serialise_double(param_delta); // ... then delta; unserialise() reads them in the same order
     153                 :         18 :     return result;
     154                 :            : }
     155                 :            : 
     156                 :            : PL2PlusWeight * // Returns a newly allocated PL2PlusWeight decoded from s; throws SerialisationError if bytes remain after the two doubles.
     157                 :          8 : PL2PlusWeight::unserialise(const string & s) const
     158                 :            : {
     159                 :          8 :     const char *ptr = s.data();
     160                 :          8 :     const char *end = ptr + s.size();
     161         [ +  - ]:          8 :     double c = unserialise_double(&ptr, end); // ptr is passed by address, presumably advanced past the decoded value - TODO confirm
     162         [ +  - ]:          8 :     double delta = unserialise_double(&ptr, end);
     163         [ +  + ]:          8 :     if (rare(ptr != end)) // anything left over means the input was not produced by serialise()
     164 [ +  - ][ +  - ]:          1 :         throw Xapian::SerialisationError("Extra data in PL2PlusWeight::unserialise()");
                 [ +  - ]
     165 [ +  - ][ +  - ]:          7 :     return new PL2PlusWeight(c, delta); // the constructor still validates c and delta
     166                 :            : }
     167                 :            : 
     168                 :            : double // Returns the weight for a term occurring wdf times in a document of length len; never negative when factor is non-negative.
     169                 :         63 : PL2PlusWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     170                 :            :                            Xapian::termcount) const
     171                 :            : {
     172 [ +  - ][ -  + ]:         63 :     if (wdf == 0 || mean < 1) return 0.0; // no occurrences, or mean (set by init()) below 1: contribute nothing
     173                 :            : 
     174                 :         63 :     double wdfn = wdf * log2(1 + cl / len); // length-normalised wdf; cl was computed in init()
     175                 :            : 
     176                 :         63 :     double P = P1 + (wdfn + 0.5) * log2(wdfn) - P2 * wdfn;
     177                 :            : 
     178                 :         63 :     double wt = (P / (wdfn + 1.0)) + dw; // dw is the constant term precomputed in init()
     179                 :            :     // FIXME: Is a negative wt possible here?  It is with vanilla PL2, but for
     180                 :            :     // PL2+ we've added on dw, and bailed out early if mean < 1.
     181         [ -  + ]:         63 :     if (rare(wt <= 0)) return 0.0;
     182                 :            : 
     183                 :         63 :     return factor * wt;
     184                 :            : }
     185                 :            : 
     186                 :            : double // Returns the upper_bound member, which init() assigns.
     187                 :         48 : PL2PlusWeight::get_maxpart() const
     188                 :            : {
     189                 :         48 :     return upper_bound;
     190                 :            : }
     191                 :            : 
     192                 :            : double // Always returns 0; both arguments are ignored.
     193                 :          0 : PL2PlusWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
     194                 :            : {
     195                 :          0 :     return 0;
     196                 :            : }
     197                 :            : 
     198                 :            : double // Always returns 0, matching get_sumextra().
     199                 :         24 : PL2PlusWeight::get_maxextra() const
     200                 :            : {
     201                 :         24 :     return 0;
     202                 :            : }
     203                 :            : 
     204                 :            : PL2PlusWeight *
     205                 :          0 : PL2PlusWeight::create_from_parameters(const char * p) const
     206                 :            : {
     207         [ #  # ]:          0 :     if (*p == '\0')
     208         [ #  # ]:          0 :         return new Xapian::PL2PlusWeight();
     209                 :          0 :     double k = 1.0;
     210                 :          0 :     double delta = 0.8;
     211         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &k))
     212 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter is invalid", "pl2pls");
     213         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &delta))
     214 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter is invalid", "pl2plus");
     215         [ #  # ]:          0 :     if (*p)
     216 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Extra data after parameter", "pl2plus");
     217 [ #  # ][ #  # ]:          0 :     return new Xapian::PL2PlusWeight(k, delta);
     218                 :            : }
     219                 :            : 
     220                 :            : }

Generated by: LCOV version 1.11