LCOV - code coverage report
Current view: top level - matcher - nearpostlist.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 53 75 70.7 %
Date: 2019-06-30 05:20:33 Functions: 6 9 66.7 %
Branches: 50 104 48.1 %

           Branch data     Line data    Source code
       1                 :            : /** @file nearpostlist.cc
       2                 :            :  * @brief Return docs containing terms within a specified window.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017,2018 Olly Betts
       5                 :            :  * Copyright (C) 2007 Lemur Consulting Ltd
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or modify
       8                 :            :  * it under the terms of the GNU General Public License as published by
       9                 :            :  * the Free Software Foundation; either version 2 of the License, or
      10                 :            :  * (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful,
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "nearpostlist.h"
      25                 :            : 
      26                 :            : #include "debuglog.h"
      27                 :            : #include "backends/positionlist.h"
      28                 :            : #include "heap.h"
      29                 :            : #include "omassert.h"
      30                 :            : #include "str.h"
      31                 :            : 
      32                 :            : #include <algorithm>
      33                 :            : #include <vector>
      34                 :            : 
      35                 :            : using namespace std;
      36                 :            : 
      37                 :        220 : NearPostList::NearPostList(PostList *source_,
      38                 :            :                            Xapian::termpos window_,
      39                 :            :                            const vector<PostList*>::const_iterator &terms_begin,
      40                 :            :                            const vector<PostList*>::const_iterator &terms_end,
      41                 :            :                            PostListTree* pltree_)
      42                 :            :     : SelectPostList(source_, pltree_),
      43                 :            :       window(window_),
      44         [ +  - ]:        220 :       terms(terms_begin, terms_end)
      45                 :            : {
      46                 :        220 :     size_t n = terms.size();
      47                 :            :     Assert(n > 1);
      48 [ +  - ][ +  - ]:        220 :     poslists = new PositionList*[n];
      49                 :        220 : }
      50                 :            : 
      51                 :        660 : NearPostList::~NearPostList()
      52                 :            : {
      53         [ +  - ]:        220 :     delete [] poslists;
      54         [ -  + ]:        440 : }
      55                 :            : 
      56                 :            : struct TermCmp {
      57                 :       2704 :     bool operator()(const PostList * a, const PostList * b) const {
      58                 :       2704 :         return a->get_wdf() < b->get_wdf();
      59                 :            :     }
      60                 :            : };
      61                 :            : 
      62                 :            : struct Cmp {
      63                 :       1778 :     bool operator()(const PositionList * a, const PositionList * b) const {
      64                 :       1778 :         return a->get_position() > b->get_position();
      65                 :            :     }
      66                 :            : };
      67                 :            : 
      68                 :            : bool
      69                 :        789 : NearPostList::test_doc()
      70                 :            : {
      71                 :            :     LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
      72                 :            : 
      73                 :            :     // Sort to put least frequent terms first, to try to minimise the number of
      74                 :            :     // position lists we need to read if there are no matches.
      75                 :            :     //
      76                 :            :     // We use wdf as a proxy for the length of the position lists, since we'd
      77                 :            :     // need to read each position list to find its length and we're trying to
      78                 :            :     // avoid having to read them all if we can.
      79         [ +  - ]:        789 :     sort(terms.begin(), terms.end(), TermCmp());
      80                 :            : 
      81         [ +  - ]:        789 :     poslists[0] = terms[0]->read_position_list();
      82 [ +  - ][ +  + ]:        789 :     if (!poslists[0]->next())
      83                 :         12 :         RETURN(false);
      84                 :            : 
      85         [ +  - ]:        777 :     Xapian::termpos last = poslists[0]->get_position();
      86                 :        777 :     PositionList ** end = poslists + 1;
      87                 :            : 
      88                 :            :     while (true) {
      89 [ +  - ][ +  + ]:       2051 :         if (last - poslists[0]->get_position() < window) {
      90         [ +  + ]:       1961 :             if (size_t(end - poslists) != terms.size()) {
      91                 :            :                 // We haven't started all the position lists yet, so start the
      92                 :            :                 // next one.
      93                 :            :                 PositionList * posl =
      94         [ +  - ]:       1352 :                     terms[end - poslists]->read_position_list();
      95         [ +  + ]:       1352 :                 if (last < window) {
      96 [ +  - ][ -  + ]:        443 :                     if (!posl->next())
      97                 :          0 :                         RETURN(false);
      98                 :            :                 } else {
      99 [ +  - ][ +  + ]:        909 :                     if (!posl->skip_to(last - window + 1))
     100                 :         99 :                         RETURN(false);
     101                 :            :                 }
     102         [ +  - ]:       1253 :                 Xapian::termpos pos = posl->get_position();
     103         [ +  + ]:       1253 :                 if (pos > last) last = pos;
     104                 :       1253 :                 *end++ = posl;
     105         [ +  - ]:       1253 :                 Heap::push(poslists, end, Cmp());
     106                 :       1253 :                 continue;
     107                 :            :             }
     108                 :            : 
     109                 :            :             // We have all the terms within the specified window, but some may
     110                 :            :             // be at the same position (in particular if the same term is
     111                 :            :             // listed twice).  So we work through the terms in ascending
     112                 :            :             // position order, and each time we find one with a duplicate
     113                 :            :             // position, we advance it - if that pushes us out of the window,
     114                 :            :             // we return to the outer loop, otherwise we reinsert it into the
     115                 :            :             // heap at its new position and continue to look for duplicates
     116                 :            :             // we need to adjust.
     117         [ +  - ]:        609 :             Xapian::termpos pos = poslists[0]->get_position();
     118         [ +  - ]:        609 :             Heap::pop(poslists, end, Cmp());
     119                 :        609 :             PositionList ** i = end - 1;
     120                 :            :             while (true) {
     121 [ +  - ][ +  + ]:       1092 :                 if (poslists[0]->get_position() == pos) {
     122 [ +  - ][ +  + ]:         42 :                     if (!poslists[0]->next())
     123                 :         35 :                         RETURN(false);
     124         [ +  - ]:          7 :                     Xapian::termpos newpos = poslists[0]->get_position();
     125 [ +  - ][ -  + ]:          7 :                     if (newpos - end[-1]->get_position() >= window) {
     126                 :            :                         // No longer fits in the window.
     127                 :          0 :                         last = newpos;
     128                 :          0 :                         break;
     129                 :            :                     }
     130         [ +  - ]:          7 :                     Heap::replace(poslists, i, Cmp());
     131                 :          7 :                     continue;
     132                 :            :                 }
     133         [ +  - ]:       1050 :                 pos = poslists[0]->get_position();
     134         [ +  - ]:       1050 :                 Heap::pop(poslists, i, Cmp());
     135         [ +  + ]:       1050 :                 if (--i == poslists) {
     136                 :            :                     Assert(pos - end[-1]->get_position() < window);
     137                 :        574 :                     RETURN(true);
     138                 :            :                 }
     139                 :            :             }
     140                 :            : 
     141         [ #  # ]:          0 :             Heap::make(poslists, end, Cmp());
     142                 :        483 :             continue;
     143                 :            :         }
     144 [ +  - ][ +  + ]:         90 :         if (!poslists[0]->skip_to(last - window + 1))
     145                 :         69 :             break;
     146 [ +  - ][ +  - ]:         21 :         last = max(last, poslists[0]->get_position());
     147         [ +  - ]:         21 :         Heap::replace(poslists, end, Cmp());
     148                 :            :     }
     149                 :            : 
     150                 :       2063 :     RETURN(false);
     151                 :            : }
     152                 :            : 
     153                 :            : Xapian::termcount
     154                 :          0 : NearPostList::get_wdf() const
     155                 :            : {
     156                 :            :     // Calculate an estimate for the wdf of a near postlist.
     157                 :            :     //
     158                 :            :     // The natural definition of the wdf for a near postlist is the number of
     159                 :            :     // times the terms occur in a group within the windowsize in the document -
     160                 :            :     // in other words, the number of times a routine like do_test() could find
     161                 :            :     // a match, if it kept going after finding the first one.  However,
     162                 :            :     // calculating this would be expensive (in CPU time at least - the position
     163                 :            :     // lists are probably already read from disk), and the benefit is dubious
     164                 :            :     // (given that the estimate here is probably only going to be used for
     165                 :            :     // calculating the weight for synonym postlists, and it's not clear that
     166                 :            :     // the above definition is exactly appropriate in this situation, anyway).
     167                 :            :     //
     168                 :            :     // Instead, we could do an estimate of this value, based on the lengths
     169                 :            :     // of the position lists.  Rather than having to open the position lists,
     170                 :            :     // we could use the wdfs, which will be the same value unless the wdfs have
     171                 :            :     // been artificially inflated - in which case we probably want to use the
     172                 :            :     // inflated value anyway (it will have been inflated to represent the fact
     173                 :            :     // that this term is more important than others, so we should make use of
     174                 :            :     // this hint).
     175                 :            :     //
     176                 :            :     // A reasonable way to calculate an estimate would be to consider the
     177                 :            :     // document to be split into W (overlapping) windows, each 1 term apart,
     178                 :            :     // and of the same length as the windowsize used for the phrase.  Then,
     179                 :            :     // calculate the probability that each term is found in this window (as
     180                 :            :     // Prob = wdf / doclen), multiply these probabilities to get the
     181                 :            :     // probability that they're all found in the window, and then multiply by
     182                 :            :     // the windowsize again to get an estimate.
     183                 :            :     //
     184                 :            :     // However, this requires the doclen, which won't always be available (ie,
     185                 :            :     // if the weighting scheme doesn't use it, we won't have read it from
     186                 :            :     // disk).
     187                 :            :     //
     188                 :            :     // Rather than force an access to disk to get the doclen, we use an even
     189                 :            :     // rougher estimate: the minimum wdf in the sub-lists.  This is actually
     190                 :            :     // the upper bound for the wdf (since the phrase can only occur the same
     191                 :            :     // number of times as the least frequent of its constituent terms).
     192                 :            :     //
     193                 :            :     // If this estimate proves to give bad results, we can revisit this and try
     194                 :            :     // a better approximation.
     195                 :          0 :     vector<PostList *>::const_iterator i = terms.begin();
     196         [ #  # ]:          0 :     Xapian::termcount wdf = (*i)->get_wdf();
     197         [ #  # ]:          0 :     while (++i != terms.end()) {
     198 [ #  # ][ #  # ]:          0 :         wdf = min(wdf, (*i)->get_wdf());
     199                 :            :     }
     200                 :          0 :     return wdf;
     201                 :            : }
     202                 :            : 
     203                 :            : Xapian::doccount
     204                 :        212 : NearPostList::get_termfreq_est() const
     205                 :            : {
     206                 :            :     // It's hard to estimate how many times the postlist will match as it
     207                 :            :     // depends a lot on the terms and window, but usually it will occur
     208                 :            :     // significantly less often than the individual terms.
     209                 :        212 :     return pl->get_termfreq_est() / 2;
     210                 :            : }
     211                 :            : 
     212                 :            : TermFreqs
     213                 :          0 : NearPostList::get_termfreq_est_using_stats(
     214                 :            :         const Xapian::Weight::Internal & stats) const
     215                 :            : {
     216                 :            :     LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
     217                 :            :     // No idea how to estimate this - do the same as get_termfreq_est() for
     218                 :            :     // now.
     219                 :          0 :     TermFreqs result(pl->get_termfreq_est_using_stats(stats));
     220                 :          0 :     result.termfreq /= 2;
     221                 :          0 :     result.reltermfreq /= 2;
     222                 :          0 :     RETURN(result);
     223                 :            : }
     224                 :            : 
     225                 :            : string
     226                 :          0 : NearPostList::get_description() const
     227                 :            : {
     228         [ #  # ]:          0 :     string m = "(Near ";
     229 [ #  # ][ #  # ]:          0 :     m += str(window);
     230         [ #  # ]:          0 :     m += ' ';
     231 [ #  # ][ #  # ]:          0 :     m += pl->get_description();
     232         [ #  # ]:          0 :     m += ")";
     233                 :          0 :     return m;
     234                 :            : }

Generated by: LCOV version 1.11