LCOV - code coverage report
Current view: top level - weight - dlhweight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 52 61 85.2 %
Date: 2019-05-23 11:15:29 Functions: 9 11 81.8 %
Branches: 22 38 57.9 %

           Branch data     Line data    Source code
       1                 :            : /** @file dlhweight.cc
       2                 :            :  * @brief Xapian::DLHWeight class - The DLH weighting scheme of the DFR framework.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2013, 2014 Aarsh Shah
       5                 :            :  * Copyright (C) 2016,2017 Olly Betts
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or
       8                 :            :  * modify it under the terms of the GNU General Public License as
       9                 :            :  * published by the Free Software Foundation; either version 2 of the
      10                 :            :  * License, or (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "xapian/weight.h"
      25                 :            : 
      26                 :            : #include "xapian/error.h"
      27                 :            : #include "common/log2.h"
      28                 :            : #include <algorithm>
      29                 :            : 
      30                 :            : using namespace std;
      31                 :            : 
      32                 :            : namespace Xapian {
      33                 :            : 
      34                 :            : DLHWeight *
      35                 :         57 : DLHWeight::clone() const
      36                 :            : {
      37                 :         57 :     return new DLHWeight();
      38                 :            : }
      39                 :            : 
      40                 :            : void
      41                 :         40 : DLHWeight::init(double factor)
      42                 :            : {
      43         [ +  + ]:         40 :     if (factor == 0.0) {
      44                 :            :         // This object is for the term-independent contribution, and that's
      45                 :            :         // always zero for this scheme.
      46                 :         20 :         return;
      47                 :            :     }
      48                 :            : 
      49                 :         20 :     double wdf_upper = get_wdf_upper_bound();
      50         [ -  + ]:         20 :     if (wdf_upper == 0) {
      51                 :          0 :         upper_bound = 0.0;
      52                 :          0 :         return;
      53                 :            :     }
      54                 :            : 
      55                 :         20 :     const double wdf_lower = 1.0;
      56                 :         20 :     double len_upper = get_doclength_upper_bound();
      57                 :         20 :     double len_lower = get_doclength_lower_bound();
      58                 :            : 
      59                 :         20 :     double N = get_collection_size();
      60                 :         20 :     double F = get_collection_freq();
      61                 :            : 
      62                 :            :     // Calculate constant values to be used in get_sumpart().
      63                 :         20 :     log_constant = get_average_length() * N / F;
      64                 :         20 :     wqf_product_factor = get_wqf() * factor;
      65                 :            : 
      66                 :            :     // Calculate values for the upper bound.
      67                 :            : 
      68                 :            :     // w <= l, so if the allowed ranges overlap, max w/l is 1.0.
      69         [ +  + ]:         20 :     double max_wdf_over_l = wdf_upper < len_lower ? wdf_upper / len_lower : 1.0;
      70                 :            : 
      71                 :            :     // First term A: w/(w+.5)*log2(w/l*L) where L=l_avg.N/coll_freq
      72                 :            :     // Assume log >= 0:
      73                 :            :     //   w/(w+.5) = 1-1/(2w+1) and w >= 1 so max A at w=w_max
      74                 :            :     //   log2(w/l*L) maximised when w/l maximised
      75                 :            :     //   so max A at w=w_max, l=max(l_min, w_max)
      76                 :            :     // If log < 0 => then A <= 0, so max A <= 0 and we want to minimise the
      77                 :            :     // factor outside the log.
      78                 :         20 :     double logged_expr = max_wdf_over_l * log_constant;
      79         [ +  + ]:         20 :     double w_for_A = logged_expr > 1.0 ? wdf_upper : wdf_lower;
      80                 :         20 :     double A = w_for_A / (w_for_A + 0.5) * log2(logged_expr);
      81                 :            : 
      82                 :            :     // Second term B:
      83                 :            :     //
      84                 :            :     // (l-w)*log2(1-w/l)
      85                 :            :     //
      86                 :            :     // The log is negative, and w <= l so B <= 0 and its maximum is the value
      87                 :            :     // as close to zero as possible.  So smaller (l-w) is better and smaller
      88                 :            :     // w/l is better.
      89                 :            :     //
      90                 :            :     // This function is ill defined at w=l, but -> 0 as w -> l.
      91                 :            :     //
      92                 :            :     // If w=l is valid (i.e. len_lower > wdf_upper) then B = 0.
      93                 :         20 :     double B = 0;
      94         [ +  + ]:         20 :     if (len_lower > wdf_upper) {
      95                 :            :         // If not, then minimising l-w gives us a candidate (i.e. w=wdf_upper
      96                 :            :         // and l=len_lower).
      97                 :            :         //
      98                 :            :         // The function is also 0 at w = 0 (there must be a local mimina at
      99                 :            :         // some value of w between 0 and l), so the other candidate is at
     100                 :            :         // w=wdf_lower.
     101                 :            :         //
     102                 :            :         // We need to find the optimum value of l in this case, so
     103                 :            :         // differentiate the formula by l:
     104                 :            :         //
     105                 :            :         // d/dl: log2(1-w/l) + (l-w)*(1-w/l)/(l*log(2))
     106                 :            :         //     = (log(1-w/l) + (1-w/l)²)/log(2)
     107                 :            :         //     = (log(x) + x²)/log(2)     [x=1-w/l]
     108                 :            :         //
     109                 :            :         // which is 0 at approx x=0.65291864
     110                 :            :         //
     111                 :            :         // x=1-w/l <=> l*(1-x)=w <=> l=w/(1-x) <=> l ~= 0.34708136*w
     112                 :            :         //
     113                 :            :         // but l >= w so we can't attain that (and the log isn't valid there).
     114                 :            :         //
     115                 :            :         // Gradient at (without loss of generality) l=2*w is:
     116                 :            :         //       (log(0.5) + 0.25)/log(2)
     117                 :            :         // which is < 0 so want to minimise l, i.e. l=len_lower, so the other
     118                 :            :         // candidate is w=wdf_lower and l=len_lower.
     119                 :            :         //
     120                 :            :         // So evaluate both candidates and pick the larger:
     121                 :         14 :         double B1 = (len_lower - wdf_lower) * log2(1.0 - wdf_lower / len_lower);
     122                 :         14 :         double B2 = (len_lower - wdf_upper) * log2(1.0 - wdf_upper / len_lower);
     123                 :         14 :         B = max(B1, B2);
     124                 :            :     }
     125                 :            : 
     126                 :            :     /* An upper bound of the term used in the third log can be obtained by:
     127                 :            :      *
     128                 :            :      * 0.5 * log2(2.0 * M_PI * wdf * (1 - wdf / len))
     129                 :            :      *
     130                 :            :      * An upper bound on wdf * (1 - wdf / len) (and hence on the log, since
     131                 :            :      * log is a monotonically increasing function on positive numbers) can
     132                 :            :      * be obtained by plugging in the upper bound of the length and
     133                 :            :      * differentiating the term w.r.t wdf which gives the value of wdf at which
     134                 :            :      * the function attains maximum value - at wdf = len_upper / 2 (or if the
     135                 :            :      * wdf can't be that large, at wdf = wdf_upper): */
     136                 :         20 :     double wdf_var = min(wdf_upper, len_upper / 2.0);
     137                 :         20 :     double max_product = wdf_var * (1.0 - wdf_var / len_upper);
     138                 :            : #if 0
     139                 :            :     /* An upper bound can also be obtained by taking the minimum and maximum
     140                 :            :      * wdf value in the formula as shown (which isn't useful now as it's always
     141                 :            :      * >= the bound above, but could be useful if we tracked bounds on wdf/len):
     142                 :            :      */
     143                 :            :     double min_wdf_to_len = wdf_lower / len_upper;
     144                 :            :     double max_product_2 = wdf_upper * (1.0 - min_wdf_to_len);
     145                 :            :     /* Take the minimum of the two upper bounds. */
     146                 :            :     max_product = min(max_product, max_product_2);
     147                 :            : #endif
     148                 :         20 :     double C = 0.5 * log2(2.0 * M_PI * max_product) / (wdf_lower + 0.5);
     149                 :         20 :     upper_bound = A + B + C;
     150                 :            : 
     151         [ -  + ]:         20 :     if (rare(upper_bound < 0.0))
     152                 :          0 :         upper_bound = 0.0;
     153                 :            :     else
     154                 :         20 :         upper_bound *= wqf_product_factor;
     155                 :            : }
     156                 :            : 
     157                 :            : string
     158                 :       1452 : DLHWeight::name() const
     159                 :            : {
     160         [ +  - ]:       1452 :     return "Xapian::DLHWeight";
     161                 :            : }
     162                 :            : 
     163                 :            : string
     164                 :       1447 : DLHWeight::short_name() const
     165                 :            : {
     166         [ +  - ]:       1447 :     return "dlh";
     167                 :            : }
     168                 :            : 
     169                 :            : string
     170                 :          8 : DLHWeight::serialise() const
     171                 :            : {
     172                 :          8 :     return string();
     173                 :            : }
     174                 :            : 
     175                 :            : DLHWeight *
     176                 :          6 : DLHWeight::unserialise(const string& s) const
     177                 :            : {
     178         [ +  + ]:          6 :     if (rare(!s.empty()))
     179 [ +  - ][ +  - ]:          1 :         throw Xapian::SerialisationError("Extra data in DLHWeight::unserialise()");
                 [ +  - ]
     180                 :          5 :     return new DLHWeight();
     181                 :            : }
     182                 :            : 
     183                 :            : double
     184                 :         45 : DLHWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     185                 :            :                        Xapian::termcount) const
     186                 :            : {
     187 [ +  - ][ +  + ]:         45 :     if (wdf == 0 || wdf == len) return 0.0;
     188                 :            : 
     189                 :         42 :     double wdf_to_len = double(wdf) / len;
     190                 :         42 :     double one_minus_wdf_to_len = 1.0 - wdf_to_len;
     191                 :            : 
     192                 :         84 :     double wt = wdf * log2(wdf_to_len * log_constant) +
     193                 :         42 :                 (len - wdf) * log2(one_minus_wdf_to_len) +
     194                 :         42 :                 0.5 * log2(2.0 * M_PI * wdf * one_minus_wdf_to_len);
     195         [ +  + ]:         42 :     if (rare(wt <= 0.0)) return 0.0;
     196                 :            : 
     197                 :         28 :     return wqf_product_factor * wt / (wdf + 0.5);
     198                 :            : }
     199                 :            : 
     200                 :            : double
     201                 :         40 : DLHWeight::get_maxpart() const
     202                 :            : {
     203                 :         40 :     return upper_bound;
     204                 :            : }
     205                 :            : 
     206                 :            : double
     207                 :          0 : DLHWeight::get_sumextra(Xapian::termcount, Xapian::termcount) const
     208                 :            : {
     209                 :          0 :     return 0;
     210                 :            : }
     211                 :            : 
     212                 :            : double
     213                 :         20 : DLHWeight::get_maxextra() const
     214                 :            : {
     215                 :         20 :     return 0;
     216                 :            : }
     217                 :            : 
     218                 :            : DLHWeight *
     219                 :          0 : DLHWeight::create_from_parameters(const char * p) const
     220                 :            : {
     221         [ #  # ]:          0 :     if (*p != '\0')
     222 [ #  # ][ #  # ]:          0 :         throw InvalidArgumentError("No parameters are required for DLHWeight");
                 [ #  # ]
     223                 :          0 :     return new Xapian::DLHWeight();
     224                 :            : }
     225                 :            : 
     226                 :            : }

Generated by: LCOV version 1.11