LCOV - code coverage report
Current view: top level - matcher - exactphrasepostlist.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 57 66 86.4 %
Date: 2019-06-30 05:20:33 Functions: 9 10 90.0 %
Branches: 29 50 58.0 %

           Branch data     Line data    Source code
       1                 :            : /** @file exactphrasepostlist.cc
       2                 :            :  * @brief Return docs containing terms forming a particular exact phrase.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2006,2007,2009,2010,2011,2014,2015,2017 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include "exactphrasepostlist.h"
      24                 :            : 
      25                 :            : #include "debuglog.h"
      26                 :            : #include "backends/positionlist.h"
      27                 :            : #include "omassert.h"
      28                 :            : 
      29                 :            : #include <algorithm>
      30                 :            : #include <vector>
      31                 :            : 
      32                 :            : using namespace std;
      33                 :            : 
      34                 :        344 : ExactPhrasePostList::ExactPhrasePostList(PostList *source_,
      35                 :            :                                          const vector<PostList*>::const_iterator &terms_begin,
      36                 :            :                                          const vector<PostList*>::const_iterator &terms_end,
      37                 :            :                                          PostListTree* pltree_)
      38         [ +  - ]:        344 :     : SelectPostList(source_, pltree_), terms(terms_begin, terms_end)
      39                 :            : {
                                            // Constructor.  source_ and pltree_ are forwarded to the
                                            // SelectPostList base; the PostList pointers in
                                            // [terms_begin, terms_end) are copied into `terms`.
                                            //
                                            // Two raw arrays of terms.size() entries are allocated here and
                                            // released by the destructor:
                                            //   poslists - one PositionList pointer per slot, filled in
                                            //              by start_position_list()
                                            //   order    - indices into `terms`, re-sorted by test_doc()
      40                 :        344 :     size_t n = terms.size();
                                            // A phrase needs at least two terms.
      41                 :            :     Assert(n > 1);
      42 [ +  - ][ +  - ]:        344 :     poslists = new PositionList*[n];
                                            // If the second allocation throws, the destructor won't run for
                                            // this partially constructed object, so free poslists by hand
                                            // before propagating the exception.
      43                 :            :     try {
      44 [ +  - ][ +  - ]:        344 :         order = new unsigned[n];
      45                 :          0 :     } catch (...) {
      46         [ #  # ]:          0 :         delete [] poslists;
      47                 :          0 :         throw;
      48                 :            :     }
                                            // Start with the identity ordering 0, 1, ..., n - 1.
      49         [ +  + ]:       1135 :     for (size_t i = 0; i < n; ++i) order[i] = unsigned(i);
      50                 :        344 : }
      51                 :            : 
      52                 :       1032 : ExactPhrasePostList::~ExactPhrasePostList()
      53                 :            : {
                                            // Release the two arrays allocated by the constructor.  Only the
                                            // arrays themselves are freed here; the PositionList objects that
                                            // poslists[] entries point to are not deleted by this destructor.
      54         [ +  - ]:        344 :     delete [] poslists;
      55         [ +  - ]:        344 :     delete [] order;
      56         [ -  + ]:        688 : }
      57                 :            : 
      58                 :            : void
      59                 :        861 : ExactPhrasePostList::start_position_list(unsigned i)
      60                 :            : {
                                            // Fill slot i of poslists with the position list of the term
                                            // that the current ordering puts in slot i, i.e. terms[order[i]].
                                            // No bounds check is done on i here.
      61                 :        861 :     poslists[i] = terms[order[i]]->read_position_list();
      62                 :        861 : }
      63                 :            : 
      64                 :            : class TermCompare {
                                            // Comparison functor over indices into a vector of PostList
                                            // pointers: index a orders before index b when terms[a] reports
                                            // a smaller get_wdf() than terms[b].  Used by test_doc() to sort
                                            // the `order` array.
                                            //
                                            // Only a reference to the vector is held, so the vector must
                                            // outlive this object.
      65                 :            :     vector<PostList *> & terms;
      66                 :            : 
      67                 :            :   public:
      68                 :        414 :     explicit TermCompare(vector<PostList *> & terms_) : terms(terms_) { }
      69                 :            : 
                                                // Strict "less than" on the wdf of the two indexed terms.
      70                 :        989 :     bool operator()(unsigned a, unsigned b) const {
      71                 :        989 :         return terms[a]->get_wdf() < terms[b]->get_wdf();
      72                 :            :     }
      73                 :            : };
      74                 :            : 
      75                 :            : bool
      76                 :        414 : ExactPhrasePostList::test_doc()
      77                 :            : {
                                            // Decide whether the current document contains the terms at
                                            // consecutive positions in phrase order.  Returns true as soon as
                                            // one aligned occurrence is found, false once any position list
                                            // is exhausted.
                                            //
                                            // Bookkeeping used below:
                                            //   order[k]    - index into `terms` of the k-th term to check;
                                            //                 it doubles as that term's offset within the
                                            //                 phrase (see `required = base + idx`).
                                            //   poslists[k] - position list for terms[order[k]], opened
                                            //                 lazily by start_position_list(k).
                                            //
                                            // NOTE(review): the code relies on skip_to(p) moving to the first
                                            // position >= p and returning false if there is none - confirm
                                            // against the PositionList interface.
      78                 :            :     LOGCALL(MATCH, bool, "ExactPhrasePostList::test_doc", NO_ARGS);
      79                 :            : 
      80                 :            :     // We often don't need to read all the position lists, so rather than using
      81                 :            :     // the shortest position lists first, we approximate by using the terms
      82                 :            :     // with the lowest wdf first.  This will typically give the same or a very
      83                 :            :     // similar order.
      84         [ +  - ]:        414 :     sort(order, order + terms.size(), TermCompare(terms));
      85                 :            : 
      86                 :            :     // If the first term we check only occurs too close to the start of the
      87                 :            :     // document, we only need to read one term's positions.  E.g. search for
      88                 :            :     // "ripe mango" when the only occurrence of 'mango' in the current document
      89                 :            :     // is at position 0.
      90                 :        414 :     start_position_list(0);
                                            // A term at offset order[0] in the phrase can't start a match
                                            // from any position smaller than order[0].
      91         [ +  + ]:        414 :     if (!poslists[0]->skip_to(order[0]))
      92                 :          7 :         RETURN(false);
      93                 :            : 
      94                 :            :     // If we get here, we'll need to read the positionlists for at least two
      95                 :            :     // terms, so check the true positionlist length for the two terms with the
      96                 :            :     // lowest wdf and if necessary swap them so the true shorter one is first.
      97                 :        407 :     start_position_list(1);
      98         [ -  + ]:        407 :     if (poslists[0]->get_approx_size() > poslists[1]->get_approx_size()) {
                                                // Slot 1 is about to become the driving list, so apply the
                                                // same minimum-position check to it before swapping.
      99         [ #  # ]:          0 :         if (!poslists[1]->skip_to(order[1]))
     100                 :          0 :             RETURN(false);
     101                 :          0 :         swap(poslists[0], poslists[1]);
     102                 :          0 :         swap(order[0], order[1]);
     103                 :            :     }
     104                 :            : 
                                            // read_hwm is the highest slot whose position list has been
                                            // opened so far (slots 0 and 1 were opened above).
     105                 :        407 :     unsigned read_hwm = 1;
                                            // idx0 is the phrase offset of the driving term (slot 0); base is
                                            // the position the phrase would start at if the driving term's
                                            // current position belongs to a match.
     106                 :        407 :     Xapian::termpos idx0 = order[0];
     107                 :        407 :     Xapian::termpos base = poslists[0]->get_position() - idx0;
     108                 :        407 :     unsigned i = 1;
     109                 :            :     while (true) {
                                                // Open the position list for slot i the first time we
                                                // reach it.
     110         [ +  + ]:        496 :         if (i > read_hwm) {
     111                 :         40 :             read_hwm = i;
     112                 :         40 :             start_position_list(i);
     113                 :            :             // FIXME: consider comparing with poslists[0] and swapping
     114                 :            :             // if less common.  Should we allow for the number of positions
     115                 :            :             // we've read from poslists[0] already?
     116                 :            :         }
                                                // For a phrase starting at base, the term in slot i must
                                                // occur at exactly base + its phrase offset.
     117                 :        496 :         Xapian::termpos idx = order[i];
     118                 :        496 :         Xapian::termpos required = base + idx;
     119         [ +  + ]:        496 :         if (!poslists[i]->skip_to(required))
     120                 :         94 :             RETURN(false);
     121                 :        402 :         Xapian::termpos got = poslists[i]->get_position();
     122         [ +  + ]:        402 :         if (got == required) {
                                                    // This term lines up; the phrase matches once every
                                                    // slot has lined up for the same base.
     123         [ +  + ]:        238 :             if (++i == terms.size()) RETURN(true);
     124                 :         40 :             continue;
     125                 :            :         }
                                                // Mismatch: move the driving list on to where it would have
                                                // to be for a phrase containing this term at `got`, then
                                                // restart the checks from slot 1 with the new base.
     126         [ +  + ]:        164 :         if (!poslists[0]->skip_to(got - idx + idx0))
     127                 :        115 :             RETURN(false);
     128                 :         49 :         base = poslists[0]->get_position() - idx0;
     129                 :         49 :         i = 1;
     130                 :        503 :     }
     131                 :            : }
     132                 :            : 
     133                 :            : Xapian::termcount
     134                 :         42 : ExactPhrasePostList::get_wdf() const
     135                 :            : {
     136                 :            :     // Calculate an estimate for the wdf of an exact phrase postlist.
     137                 :            :     //
     138                 :            :     // We use the minimum wdf of a sub-postlist as our estimate.  See the
     139                 :            :     // comment in NearPostList::get_wdf() for justification of this estimate.
                                            //
                                            // The first element is read before the loop, so `terms` must be
                                            // non-empty; the constructor asserts it has more than one entry.
     140                 :         42 :     vector<PostList *>::const_iterator i = terms.begin();
     141         [ +  - ]:         42 :     Xapian::termcount wdf = (*i)->get_wdf();
                                            // Fold the remaining terms into the running minimum.
     142         [ +  + ]:         84 :     while (++i != terms.end()) {
     143 [ +  - ][ +  - ]:         42 :         wdf = min(wdf, (*i)->get_wdf());
     144                 :            :     }
     145                 :         42 :     return wdf;
     146                 :            : }
     147                 :            : 
     148                 :            : Xapian::doccount
     149                 :        519 : ExactPhrasePostList::get_termfreq_est() const
     150                 :            : {
                                            // Estimated term frequency for the phrase: a quarter (integer
                                            // division) of the estimate reported by `pl`.
                                            // NOTE(review): `pl` is not declared in this file; presumably it
                                            // is the source postlist held by the SelectPostList base.
                                            //
     151                 :            :     // It's hard to estimate how many times the exact phrase will occur as
     152                 :            :     // it depends a lot on the phrase, but usually the exact phrase will
     153                 :            :     // occur significantly less often than the individual terms.
     154                 :            :     //
     155                 :            :     // We divide by 4 here rather than by 2 as we do for NearPostList and
     156                 :            :     // PhrasePostList, as a very rough heuristic to represent the fact that the
     157                 :            :     // words must occur exactly in order, and phrases are therefore rarer than
     158                 :            :     // near matches and (non-exact) phrase matches.
     159                 :        519 :     return pl->get_termfreq_est() / 4;
     160                 :            : }
     161                 :            : 
     162                 :            : TermFreqs
     163                 :         16 : ExactPhrasePostList::get_termfreq_est_using_stats(
     164                 :            :         const Xapian::Weight::Internal & stats) const
     165                 :            : {
                                            // Stats-based variant of get_termfreq_est(): take the estimate
                                            // that `pl` computes from `stats` and divide its termfreq and
                                            // reltermfreq fields by 4.  Any other fields of the returned
                                            // TermFreqs are passed through unchanged.
     166                 :            :     LOGCALL(MATCH, TermFreqs, "ExactPhrasePostList::get_termfreq_est_using_stats", stats);
     167                 :            :     // No idea how to estimate this - do the same as get_termfreq_est() for
     168                 :            :     // now.
     169                 :         16 :     TermFreqs result(pl->get_termfreq_est_using_stats(stats));
     170                 :         16 :     result.termfreq /= 4;
     171                 :         16 :     result.reltermfreq /= 4;
     172                 :         16 :     RETURN(result);
     173                 :            : }
     174                 :            : 
     175                 :            : string
     176                 :          0 : ExactPhrasePostList::get_description() const
     177                 :            : {
                                            // Description string: the description reported by `pl`, wrapped
                                            // as "(ExactPhrase <description>)".
     178 [ #  # ][ #  # ]:          0 :     return "(ExactPhrase " + pl->get_description() + ")";
     179                 :            : }

Generated by: LCOV version 1.11