LCOV - code coverage report
Current view: top level - matcher - localsubmatch.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7028d852e609 Lines: 53 80 66.2 %
Date: 2019-02-17 14:59:59 Functions: 3 14 21.4 %
Branches: 57 136 41.9 %

           Branch data     Line data    Source code
       1                 :            : /** @file localsubmatch.cc
       2                 :            :  *  @brief SubMatch class for a local database.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2006,2007,2009,2010,2011,2013,2014,2015,2016,2017,2018 Olly Betts
       5                 :            :  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or modify
       8                 :            :  * it under the terms of the GNU General Public License as published by
       9                 :            :  * the Free Software Foundation; either version 2 of the License, or
      10                 :            :  * (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful,
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "localsubmatch.h"
      25                 :            : 
      26                 :            : #include "backends/databaseinternal.h"
      27                 :            : #include "debuglog.h"
      28                 :            : #include "api/emptypostlist.h"
      29                 :            : #include "extraweightpostlist.h"
      30                 :            : #include "api/leafpostlist.h"
      31                 :            : #include "omassert.h"
      32                 :            : #include "queryoptimiser.h"
      33                 :            : #include "synonympostlist.h"
      34                 :            : #include "api/termlist.h"
      35                 :            : #include "weight/weightinternal.h"
      36                 :            : 
      37                 :            : #include "xapian/error.h"
      38                 :            : 
      39                 :            : #include <memory>
      40                 :            : #include <map>
      41                 :            : #include <string>
      42                 :            : 
      43                 :            : using namespace std;
      44                 :            : 
      45                 :            : /** Xapian::Weight subclass which adds laziness.
      46                 :            :  *
      47                 :            :  *  For terms from a wildcard when remote databases are involved, we need to
      48                 :            :  *  delay calling init_() on the weight object until the stats for the terms
      49                 :            :  *  from the wildcard have been collated.
      50                 :            :  */
      51         [ #  # ]:          0 : class LazyWeight : public Xapian::Weight {
      52                 :            :     LeafPostList * pl;
      53                 :            : 
      54                 :            :     Xapian::Weight * real_wt;
      55                 :            : 
      56                 :            :     Xapian::Weight::Internal * stats;
      57                 :            : 
      58                 :            :     Xapian::termcount qlen;
      59                 :            : 
      60                 :            :     Xapian::termcount wqf;
      61                 :            : 
      62                 :            :     double factor;
      63                 :            : 
      64                 :            :     LazyWeight * clone() const;
      65                 :            : 
      66                 :            :     void init(double factor_);
      67                 :            : 
      68                 :            :   public:
      69                 :          0 :     LazyWeight(LeafPostList * pl_,
      70                 :            :                Xapian::Weight * real_wt_,
      71                 :            :                Xapian::Weight::Internal * stats_,
      72                 :            :                Xapian::termcount qlen_,
      73                 :            :                Xapian::termcount wqf__,
      74                 :            :                double factor_)
      75                 :            :         : pl(pl_),
      76                 :            :           real_wt(real_wt_),
      77                 :            :           stats(stats_),
      78                 :            :           qlen(qlen_),
      79                 :            :           wqf(wqf__),
      80                 :          0 :           factor(factor_)
      81                 :          0 :     { }
      82                 :            : 
      83                 :            :     std::string name() const;
      84                 :            : 
      85                 :            :     std::string serialise() const;
      86                 :            :     LazyWeight * unserialise(const std::string & serialised) const;
      87                 :            : 
      88                 :            :     double get_sumpart(Xapian::termcount wdf,
      89                 :            :                        Xapian::termcount doclen,
      90                 :            :                        Xapian::termcount uniqterms) const;
      91                 :            :     double get_maxpart() const;
      92                 :            : 
      93                 :            :     double get_sumextra(Xapian::termcount doclen,
      94                 :            :                         Xapian::termcount uniqterms) const;
      95                 :            :     double get_maxextra() const;
      96                 :            : };
      97                 :            : 
      98                 :            : LazyWeight *
      99                 :          0 : LazyWeight::clone() const
     100                 :            : {
     101 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::clone()");
                 [ #  # ]
     102                 :            : }
     103                 :            : 
     104                 :            : void
     105                 :          0 : LazyWeight::init(double factor_)
     106                 :            : {
     107                 :            :     (void)factor_;
     108 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::init()");
                 [ #  # ]
     109                 :            : }
     110                 :            : 
     111                 :            : string
     112                 :          0 : LazyWeight::name() const
     113                 :            : {
     114         [ #  # ]:          0 :     string desc = "LazyWeight(";
     115 [ #  # ][ #  # ]:          0 :     desc += real_wt->name();
     116         [ #  # ]:          0 :     desc += ")";
     117                 :          0 :     return desc;
     118                 :            : }
     119                 :            : 
     120                 :            : string
     121                 :          0 : LazyWeight::serialise() const
     122                 :            : {
     123 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::serialise()");
                 [ #  # ]
     124                 :            : }
     125                 :            : 
     126                 :            : LazyWeight *
     127                 :          0 : LazyWeight::unserialise(const string &) const
     128                 :            : {
     129 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::unserialise()");
                 [ #  # ]
     130                 :            : }
     131                 :            : 
     132                 :            : double
     133                 :          0 : LazyWeight::get_sumpart(Xapian::termcount wdf,
     134                 :            :                         Xapian::termcount doclen,
     135                 :            :                         Xapian::termcount uniqterms) const
     136                 :            : {
     137                 :            :     (void)wdf;
     138                 :            :     (void)doclen;
     139                 :            :     (void)uniqterms;
     140 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::get_sumpart()");
                 [ #  # ]
     141                 :            : }
     142                 :            : 
     143                 :            : double
     144                 :          0 : LazyWeight::get_sumextra(Xapian::termcount doclen,
     145                 :            :                          Xapian::termcount uniqterms) const
     146                 :            : {
     147                 :            :     (void)doclen;
     148                 :            :     (void)uniqterms;
     149 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::get_sumextra()");
                 [ #  # ]
     150                 :            : }
     151                 :            : 
     152                 :            : double
     153                 :          0 : LazyWeight::get_maxpart() const
     154                 :            : {
     155                 :            :     // This gets called first for the case we care about.
     156                 :          0 :     return pl->resolve_lazy_termweight(real_wt, stats, qlen, wqf, factor);
     157                 :            : }
     158                 :            : 
     159                 :            : double
     160                 :          0 : LazyWeight::get_maxextra() const
     161                 :            : {
     162 [ #  # ][ #  # ]:          0 :     throw Xapian::InvalidOperationError("LazyWeight::get_maxextra()");
                 [ #  # ]
     163                 :            : }
     164                 :            : 
     165                 :            : PostList *
     166                 :     178077 : LocalSubMatch::get_postlist(PostListTree * matcher,
     167                 :            :                             Xapian::termcount * total_subqs_ptr)
     168                 :            : {
     169                 :            :     LOGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist", matcher | total_subqs_ptr);
     170                 :            : 
     171         [ -  + ]:     178077 :     if (query.empty())
     172         [ #  # ]:          0 :         RETURN(new EmptyPostList); // MatchNothing
     173                 :            : 
     174                 :            :     // Build the postlist tree for the query.  This calls
     175                 :            :     // LocalSubMatch::open_post_list() for each term in the query.
     176                 :            :     PostList * pl;
     177                 :            :     {
     178         [ +  - ]:     178077 :         QueryOptimiser opt(*db, *this, matcher, full_db_has_positions);
     179         [ +  + ]:     178077 :         pl = query.internal->postlist(&opt, 1.0);
     180                 :     178077 :         *total_subqs_ptr = opt.get_total_subqs();
     181                 :            :     }
     182                 :            : 
     183         [ +  - ]:     178046 :     unique_ptr<Xapian::Weight> extra_wt(wt_factory.clone());
     184                 :            :     // Only uses term-independent stats.
     185         [ +  - ]:     178046 :     extra_wt->init_(*total_stats, qlen);
     186 [ +  - ][ +  + ]:     178046 :     if (extra_wt->get_maxextra() != 0.0) {
     187                 :            :         // There's a term-independent weight contribution, so we combine the
     188                 :            :         // postlist tree with an ExtraWeightPostList which adds in this
     189                 :            :         // contribution.
     190 [ +  - ][ +  - ]:       1334 :         pl = new ExtraWeightPostList(pl, extra_wt.release(), matcher);
     191                 :            :     }
     192                 :            : 
     193                 :     178046 :     RETURN(pl);
     194                 :            : }
     195                 :            : 
     196                 :            : PostList *
     197                 :        733 : LocalSubMatch::make_synonym_postlist(PostListTree* pltree,
     198                 :            :                                      PostList* or_pl,
     199                 :            :                                      double factor,
     200                 :            :                                      bool wdf_disjoint)
     201                 :            : {
     202                 :            :     LOGCALL(MATCH, PostList *, "LocalSubMatch::make_synonym_postlist", pltree | or_pl | factor | wdf_disjoint);
     203 [ +  - ][ +  + ]:        733 :     if (rare(or_pl->get_termfreq_max() == 0)) {
     204                 :            :         // or_pl is an EmptyPostList or equivalent.
     205                 :         21 :         return or_pl;
     206                 :            :     }
     207                 :            :     LOGVALUE(MATCH, or_pl->get_termfreq_est());
     208                 :            :     unique_ptr<SynonymPostList> res(new SynonymPostList(or_pl, db, pltree,
     209 [ +  - ][ +  - ]:        712 :                                                         wdf_disjoint));
     210         [ +  - ]:       1424 :     unique_ptr<Xapian::Weight> wt(wt_factory.clone());
     211                 :            : 
     212                 :        712 :     TermFreqs freqs;
     213                 :            :     // Avoid calling get_termfreq_est_using_stats() if the database is empty
     214                 :            :     // so we don't need to special case that repeatedly when implementing it.
     215                 :            :     // FIXME: it would be nicer to handle an empty database higher up, though
     216                 :            :     // we need to catch the case where all the non-empty subdatabases have
     217                 :            :     // failed, so we can't just push this right up to the start of get_mset().
     218         [ +  - ]:        712 :     if (usual(total_stats->collection_size != 0)) {
     219         [ +  - ]:        712 :         freqs = or_pl->get_termfreq_est_using_stats(*total_stats);
     220                 :            :     }
     221                 :            :     wt->init_(*total_stats, qlen, factor,
     222         [ +  - ]:        712 :               freqs.termfreq, freqs.reltermfreq, freqs.collfreq);
     223                 :            : 
     224         [ +  - ]:        712 :     res->set_weight(wt.release());
     225                 :       1445 :     RETURN(res.release());
     226                 :            : }
     227                 :            : 
     228                 :            : PostList *
     229                 :     313310 : LocalSubMatch::open_post_list(const string& term,
     230                 :            :                               Xapian::termcount wqf,
     231                 :            :                               double factor,
     232                 :            :                               bool need_positions,
     233                 :            :                               bool in_synonym,
     234                 :            :                               QueryOptimiser * qopt,
     235                 :            :                               bool lazy_weight)
     236                 :            : {
     237                 :            :     LOGCALL(MATCH, PostList *, "LocalSubMatch::open_post_list", term | wqf | factor | need_positions | qopt | lazy_weight);
     238                 :            : 
     239                 :     313310 :     bool weighted = false;
     240                 :            : 
     241                 :     313310 :     LeafPostList * pl = NULL;
     242         [ +  + ]:     313310 :     if (term.empty()) {
     243                 :            :         Assert(!need_positions);
     244                 :         72 :         pl = db->open_leaf_post_list(term, false);
     245                 :            :     } else {
     246                 :     313238 :         weighted = (factor != 0.0);
     247         [ +  + ]:     313238 :         if (!need_positions) {
     248         [ +  + ]:     622396 :             if ((!weighted && !in_synonym) ||
           [ +  +  +  + ]
                 [ +  + ]
     249                 :     311042 :                 !wt_factory.get_sumpart_needs_wdf_()) {
     250                 :            :                 Xapian::doccount sub_tf;
     251         [ +  - ]:        849 :                 db->get_freqs(term, &sub_tf, NULL);
     252 [ +  - ][ +  + ]:        849 :                 if (sub_tf == db->get_doccount()) {
     253                 :            :                     // If we're not going to use the wdf or term positions, and
     254                 :            :                     // the term indexes all documents, we can replace it with
     255                 :            :                     // the MatchAll postlist, which is especially efficient if
     256                 :            :                     // there are no gaps in the docids.
     257 [ +  - ][ +  - ]:        136 :                     pl = db->open_leaf_post_list(string(), false);
     258                 :            : 
     259                 :            :                     // Set the term name so the postlist looks up the correct
     260                 :            :                     // term frequencies - this is necessary if the weighting
     261                 :            :                     // scheme needs collection frequency or reltermfreq
     262                 :            :                     // (termfreq would be correct anyway since it's just the
     263                 :            :                     // collection size in this case).
     264         [ +  - ]:     311354 :                     pl->set_term(term);
     265                 :            :                 }
     266                 :            :             }
     267                 :            :         }
     268                 :            : 
     269         [ +  + ]:     313238 :         if (!pl) {
     270                 :     313102 :             const LeafPostList * hint = qopt->get_hint_postlist();
     271         [ +  + ]:     313102 :             if (hint)
     272                 :     148041 :                 pl = hint->open_nearby_postlist(term, need_positions);
     273         [ +  + ]:     313102 :             if (!pl) {
     274                 :     189743 :                 pl = db->open_leaf_post_list(term, need_positions);
     275                 :            :             }
     276                 :     313102 :             qopt->set_hint_postlist(pl);
     277                 :            :         }
     278                 :            :     }
     279                 :            : 
     280         [ +  + ]:     313310 :     if (lazy_weight) {
     281                 :            :         // Term came from a wildcard, but we may already have that term in the
     282                 :            :         // query anyway, so check before accumulating its TermFreqs.
     283         [ +  - ]:        769 :         map<string, TermFreqs>::iterator i = total_stats->termfreqs.find(term);
     284         [ +  + ]:        769 :         if (i == total_stats->termfreqs.end()) {
     285                 :            :             Xapian::doccount sub_tf;
     286                 :            :             Xapian::termcount sub_cf;
     287         [ +  - ]:        763 :             db->get_freqs(term, &sub_tf, &sub_cf);
     288 [ +  - ][ +  - ]:        769 :             total_stats->termfreqs.insert({term, TermFreqs(sub_tf, 0, sub_cf)});
     289                 :            :         }
     290                 :            :     }
     291                 :            : 
     292         [ +  + ]:     313310 :     if (weighted) {
     293                 :     310665 :         Xapian::Weight * wt = wt_factory.clone();
     294         [ +  - ]:     310665 :         if (!lazy_weight) {
     295                 :     310665 :             wt->init_(*total_stats, qlen, term, wqf, factor);
     296                 :     310665 :             total_stats->set_max_part(term, wt->get_maxpart());
     297                 :            :         } else {
     298                 :            :             // Delay initialising the actual weight object, so that we can
     299                 :            :             // gather stats for the terms lazily expanded from a wildcard
     300                 :            :             // (needed for the remote database case).
     301                 :          0 :             wt = new LazyWeight(pl, wt, total_stats, qlen, wqf, factor);
     302                 :            :         }
     303                 :     310665 :         pl->set_termweight(wt);
     304                 :            :     }
     305                 :     313310 :     RETURN(pl);
     306                 :            : }

Generated by: LCOV version 1.11