LCOV - code coverage report
Current view: top level - weight - lmweight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core eba1a2e3082b Lines: 81 110 73.6 %
Date: 2019-06-13 13:35:36 Functions: 10 12 83.3 %
Branches: 53 130 40.8 %

           Branch data     Line data    Source code
       1                 :            : /** @file lmweight.cc
       2                 :            :  * @brief Xapian::LMWeight class - the Unigram Language Modelling formula.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2012 Gaurav Arora
       5                 :            :  * Copyright (C) 2016 Olly Betts
       6                 :            :  * Copyright (C) 2016 Vivek Pal
       7                 :            :  *
       8                 :            :  * This program is free software; you can redistribute it and/or
       9                 :            :  * modify it under the terms of the GNU General Public License as
      10                 :            :  * published by the Free Software Foundation; either version 2 of the
      11                 :            :  * License, or (at your option) any later version.
      12                 :            :  *
      13                 :            :  * This program is distributed in the hope that it will be useful
      14                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      16                 :            :  * GNU General Public License for more details.
      17                 :            :  *
      18                 :            :  * You should have received a copy of the GNU General Public License
      19                 :            :  * along with this program; if not, write to the Free Software
      20                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
      21                 :            :  */
      22                 :            : 
      23                 :            : #include <config.h>
      24                 :            : 
      25                 :            : #include "xapian/weight.h"
      26                 :            : #include "weightinternal.h"
      27                 :            : 
      28                 :            : #include "debuglog.h"
      29                 :            : #include "omassert.h"
      30                 :            : #include "serialise-double.h"
      31                 :            : 
      32                 :            : #include "xapian/error.h"
      33                 :            : 
      34                 :            : #include <cerrno>
      35                 :            : #include <cmath>
      36                 :            : #include <cstdlib>
      37                 :            : 
      38                 :            : using namespace std;
      39                 :            : 
      40                 :            : namespace Xapian {
      41                 :            : 
      42                 :            : LMWeight *
      43                 :        253 : LMWeight::clone() const {
      44                 :        253 :     return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2);
      45                 :            : }
      46                 :            : 
      47                 :            : void
      48                 :        176 : LMWeight::init(double factor_)
      49                 :            : {
      50                 :        176 :     factor = factor_;
      51                 :            : 
      52                 :            :     // Storing collection frequency of current term in collection_freq to be
      53                 :            :     // accessed while smoothing of weights for the term, for term not present
      54                 :            :     // in the document.
      55                 :        176 :     double collection_freq = get_collection_freq();
      56                 :            : 
      57                 :            :     // Collection_freq of a term in collection should be always greater than or
      58                 :            :     // equal to zero (Non Negative).
      59                 :            :     AssertRel(collection_freq,>=,0);
      60                 :            :     LOGVALUE(WTCALC, collection_freq);
      61                 :            : 
      62                 :            :     // calculating approximate number of total terms in the collection to be
      63                 :            :     // accessed for smoothing of the document.
      64                 :        176 :     double total_collection_term = get_collection_size() * get_average_length();
      65                 :            : 
      66                 :            :     /* In case the within document frequency of term is zero smoothing will
      67                 :            :      * be required and should be return instead of returning zero, as returning
      68                 :            :      * LM score are multiplication of contribution of all terms, due to absence
      69                 :            :      * of single term whole document is scored zero, hence apply collection
      70                 :            :      * frequency smoothing.
      71                 :            :      */
      72                 :        176 :     weight_collection = double(collection_freq) / total_collection_term;
      73                 :            : 
      74                 :            :     // Total term should be greater than zero as there would be at least one
      75                 :            :     // document in collection.
      76                 :            :     AssertRel(total_collection_term,>,0);
      77                 :            :     LOGVALUE(WTCALC, total_collection_term);
      78                 :            : 
      79                 :            :     // There can't be more relevant term in collection than total number of
      80                 :            :     // term.
      81                 :            :     AssertRel(collection_freq,<=,total_collection_term);
      82                 :            : 
      83                 :            :     /* Setting default values of the param_log to handle negative value of log.
      84                 :            :      * It is considered to be upperbound of document length.
      85                 :            :      * initializing param_log to upperbound of document_length.
      86                 :            :      */
      87                 :            : 
      88         [ +  + ]:        176 :     if (param_log == 0.0) {
      89                 :        112 :         param_log = get_doclength_upper_bound();
      90                 :            :     }
      91                 :            : 
      92                 :            :     /* Since the optimal parameter for Jelinek mercer smoothing
      93                 :            :      * is based on query length, so if query is title query changing
      94                 :            :      * default value of smoothing parameter.
      95                 :            :      */
      96                 :            : 
      97 [ +  + ][ +  + ]:        176 :     if (select_smoothing == JELINEK_MERCER_SMOOTHING ||
      98                 :        144 :         select_smoothing == TWO_STAGE_SMOOTHING) {
      99         [ +  + ]:         80 :         if (param_smoothing1 == 0.7) {
     100         [ +  - ]:         16 :             if (get_query_length() <= 2) {
     101                 :         16 :                 param_smoothing1 = 0.1;
     102                 :            :             }
     103                 :            :         }
     104                 :            :     }
     105                 :            : 
     106                 :            :     /* param_smoothing1 default value should be 2000 in case
     107                 :            :      * DIRICHLET_SMOOTHING is selected. Tweaking param_smoothing1
     108                 :            :      * if user supply his own value for param_smoothing1 value will not be set
     109                 :            :      * to 2000(default value)
     110                 :            :      */
     111         [ +  + ]:        176 :     if (select_smoothing == DIRICHLET_SMOOTHING) {
     112         [ -  + ]:         64 :         if (param_smoothing1 == 0.7) {
     113                 :          0 :             param_smoothing1 = 2000;
     114                 :            :         }
     115                 :            :     }
     116                 :            : 
     117                 :            :     /* Setting param_smoothing1 and param_smoothing2 default value to used when
     118                 :            :      * DIRICHLET_PLUS_SMOOTHING is selected.*/
     119         [ +  + ]:        176 :     if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
     120         [ -  + ]:         16 :         if (param_smoothing1 == 0.7) {
     121                 :          0 :             param_smoothing1 = 2000;
     122                 :            :         }
     123                 :            :     }
     124                 :        176 : }
     125                 :            : 
     126                 :            : string
     127                 :       1470 : LMWeight::name() const
     128                 :            : {
     129         [ +  - ]:       1470 :     return "Xapian::LMWeight";
     130                 :            : }
     131                 :            : 
     132                 :            : string
     133                 :       1447 : LMWeight::short_name() const
     134                 :            : {
     135         [ +  - ]:       1447 :     return "lm";
     136                 :            : }
     137                 :            : 
     138                 :            : string
     139                 :         29 : LMWeight::serialise() const
     140                 :            : {
     141                 :         29 :     string result = serialise_double(param_log);
     142         [ +  - ]:         29 :     result += static_cast<unsigned char>(select_smoothing);
     143 [ +  - ][ +  - ]:         29 :     result += serialise_double(param_smoothing1);
     144 [ +  - ][ +  - ]:         29 :     result += serialise_double(param_smoothing2);
     145                 :         29 :     return result;
     146                 :            : }
     147                 :            : 
     148                 :            : LMWeight *
     149                 :         25 : LMWeight::unserialise(const string & s) const
     150                 :            : {
     151                 :         25 :     const char *ptr = s.data();
     152                 :         25 :     const char *end = ptr + s.size();
     153         [ +  - ]:         25 :     double param_log_ = unserialise_double(&ptr, end);
     154                 :         25 :     type_smoothing select_smoothing_ = static_cast<type_smoothing>(*(ptr)++);
     155         [ +  - ]:         25 :     double param_smoothing1_ = unserialise_double(&ptr, end);
     156         [ +  - ]:         25 :     double param_smoothing2_ = unserialise_double(&ptr, end);
     157         [ +  + ]:         25 :     if (rare(ptr != end))
     158 [ +  - ][ +  - ]:          2 :         throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()");
                 [ +  - ]
     159         [ +  - ]:         23 :     return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_);
     160                 :            : }
     161                 :            : 
     162                 :            : double
     163                 :       3227 : LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len,
     164                 :            :                       Xapian::termcount uniqterm) const
     165                 :            : {
     166                 :            :     // Within Document Frequency of the term in document being considered.
     167                 :       3227 :     double wdf_double = wdf;
     168                 :            :     // Length of the Document in terms of number of terms.
     169                 :       3227 :     double len_double = len;
     170                 :            :     // variable to store weight contribution of term in the document scoring for LM.
     171                 :            :     double weight_sum;
     172                 :            : 
     173                 :            :     // Calculating weights considering different smoothing option available to user.
     174         [ +  + ]:       3227 :     if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
     175                 :            :         /* Maximum likelihood of current term, weight contribution of term in
     176                 :            :          * case query term is present in the document.
     177                 :            :          */
     178                 :         70 :         double weight_document = wdf_double / len_double;
     179                 :         70 :         weight_sum = (param_smoothing1 * weight_collection) +
     180                 :         70 :                      ((1 - param_smoothing1) * weight_document);
     181         [ +  + ]:       3157 :     } else if (select_smoothing == DIRICHLET_SMOOTHING) {
     182                 :        140 :         weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) /
     183                 :        140 :                      (len_double + param_smoothing1);
     184         [ +  + ]:       3017 :     } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
     185                 :            :         /* In the Dir+ weighting formula, sumpart weight contribution is :-
     186                 :            :          *
     187                 :            :          * sum of log of (1 + (wdf/(param_smoothing1 * weight_collection))) and
     188                 :            :          * log of (1 + (delta/param_smoothing1 * weight_collection))).
     189                 :            :          * Since, sum of logs is log of product so weight_sum is calculated as product
     190                 :            :          * of terms in log in the Dir+ formula.
     191                 :            :          */
     192                 :         35 :         weight_sum = (1 + (wdf_double / (param_smoothing1 * weight_collection))) *
     193                 :         35 :                      (1 + (param_smoothing2 / (param_smoothing1 * weight_collection)));
     194         [ +  + ]:       2982 :     } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
     195                 :         35 :         double uniqterm_double = uniqterm;
     196         [ +  - ]:         35 :         weight_sum = ((((wdf_double - param_smoothing1) > 0) ? (wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * weight_collection * uniqterm_double) / len_double);
     197                 :            :     } else {
     198                 :       2947 :         weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * weight_collection)) / (len_double + param_smoothing2)) + (param_smoothing1 * weight_collection));
     199                 :            :     }
     200                 :            : 
     201                 :            :     /* Since LM score is calculated with multiplication, instead of changing
     202                 :            :      * the current implementation log trick have been used to calculate the
     203                 :            :      * product since (sum of log is log of product and since aim is ranking
     204                 :            :      * ranking document by product or log of product won't make a large
     205                 :            :      * difference hence log(product) will be used for ranking.
     206                 :            :      */
     207                 :       3227 :     double product = weight_sum * param_log;
     208         [ +  - ]:       3227 :     return (product > 1.0) ? factor * log(product) : 0;
     209                 :            : }
     210                 :            : 
     211                 :            : double
     212                 :        177 : LMWeight::get_maxpart() const
     213                 :            : {
     214                 :            :     // Variable to store the collection frequency
     215                 :            :     double upper_bound;
     216                 :            :     // Store upper bound on wdf in variable wdf_max
     217                 :        177 :     double wdf_max = get_wdf_upper_bound();
     218                 :            : 
     219                 :            :     // Calculating upper bound considering different smoothing option available to user.
     220         [ +  + ]:        177 :     if (select_smoothing == JELINEK_MERCER_SMOOTHING) {
     221                 :         32 :         upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1);
     222         [ +  + ]:        145 :     } else if (select_smoothing == DIRICHLET_SMOOTHING) {
     223                 :         64 :         upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing1);
     224         [ +  + ]:         81 :     } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
     225                 :         16 :         upper_bound = (1 + (wdf_max / (param_smoothing1 * weight_collection))) *
     226                 :         16 :                       (1 + (param_smoothing2 / (param_smoothing1 * weight_collection)));
     227         [ +  + ]:         65 :     } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) {
     228                 :         16 :         upper_bound = param_smoothing1 * weight_collection + 1;
     229                 :            :     } else {
     230                 :         49 :         upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * weight_collection));
     231                 :            :     }
     232                 :            : 
     233                 :            :     /* Since weight are calculated using log trick, using same with the bounds. Refer
     234                 :            :      * comment in get_sumpart for the details.
     235                 :            :      */
     236                 :        177 :     double product = upper_bound * param_log;
     237         [ +  - ]:        177 :     return (product > 1.0) ? factor * log(product) : 1.0;
     238                 :            : }
     239                 :            : 
     240                 :            : /* The extra weight component in the Dir+ formula is :-
     241                 :            :  *
     242                 :            :  * |Q| * log (param_smoothing1 / (|D| + param_smoothing1))
     243                 :            :  *
     244                 :            :  * where, |Q| is total query length.
     245                 :            :  *        |D| is total document length.
     246                 :            :  */
     247                 :            : double
     248                 :         35 : LMWeight::get_sumextra(Xapian::termcount len, Xapian::termcount) const
     249                 :            : {
     250         [ +  - ]:         35 :     if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
     251                 :         35 :         double extra_weight = param_smoothing1 / (len + param_smoothing1);
     252                 :         35 :         return get_query_length() * log(extra_weight);
     253                 :            :     }
     254                 :          0 :     return 0;
     255                 :            : }
     256                 :            : 
     257                 :            : double
     258                 :         96 : LMWeight::get_maxextra() const
     259                 :            : {
     260         [ +  + ]:         96 :     if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) {
     261                 :         16 :         double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1);
     262                 :         16 :         return get_query_length() * log(extra_weight);
     263                 :            :     }
     264                 :         80 :     return 0;
     265                 :            : }
     266                 :            : 
     267                 :            : static bool
     268                 :          0 : type_smoothing_param(const char ** p, Xapian::Weight::type_smoothing * ptr_val)
     269                 :            : {
     270                 :            :     char *end;
     271                 :          0 :     errno = 0;
     272                 :          0 :     int v = strtol(*p, &end, 10);
     273 [ #  # ][ #  # ]:          0 :     if (*p == end || errno || v < 1 || v > 5)
         [ #  # ][ #  # ]
     274                 :          0 :         return false;
     275                 :          0 :     *p = end;
     276                 :            :     static const Xapian::Weight::type_smoothing smooth_tab[5] = {
     277                 :            :         Xapian::Weight::TWO_STAGE_SMOOTHING,
     278                 :            :         Xapian::Weight::DIRICHLET_SMOOTHING,
     279                 :            :         Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING,
     280                 :            :         Xapian::Weight::JELINEK_MERCER_SMOOTHING,
     281                 :            :         Xapian::Weight::DIRICHLET_PLUS_SMOOTHING
     282                 :            :     };
     283                 :          0 :     *ptr_val = smooth_tab[v - 1];
     284                 :          0 :     return true;
     285                 :            : }
     286                 :            : 
     287                 :            : LMWeight *
     288                 :          0 : LMWeight::create_from_parameters(const char * p) const
     289                 :            : {
     290         [ #  # ]:          0 :     if (*p == '\0')
     291         [ #  # ]:          0 :         return new Xapian::LMWeight();
     292                 :          0 :     double param_log_ = 0;
     293                 :          0 :     Xapian::Weight::type_smoothing type = Xapian::Weight::TWO_STAGE_SMOOTHING;
     294                 :          0 :     double smoothing1 = 0.7;
     295                 :          0 :     double smoothing2 = 2000;
     296         [ #  # ]:          0 :     if (!Xapian::Weight::Internal::double_param(&p, &param_log_))
     297 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 1 (log) is invalid", "lm");
     298 [ #  # ][ #  # ]:          0 :     if (*p && !type_smoothing_param(&p, &type))
                 [ #  # ]
     299 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 2 (smoothing_type) is invalid", "lm");
     300 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing1))
                 [ #  # ]
     301 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 3 (smoothing1) is invalid", "lm");
     302 [ #  # ][ #  # ]:          0 :     if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing2))
                 [ #  # ]
     303 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Parameter 4 (smoothing2) is invalid", "lm");
     304         [ #  # ]:          0 :     if (*p)
     305 [ #  # ][ #  # ]:          0 :         Xapian::Weight::Internal::parameter_error("Extra data after parameter 4", "lm");
     306         [ #  # ]:          0 :     return new Xapian::LMWeight(param_log_, type, smoothing1, smoothing2);
     307                 :            : }
     308                 :            : 
     309                 :            : }

Generated by: LCOV version 1.11