LCOV - code coverage report
Current view: top level - matcher - protomset.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7028d852e609 Lines: 221 231 95.7 %
Date: 2019-02-17 14:59:59 Functions: 16 16 100.0 %
Branches: 194 233 83.3 %

           Branch data     Line data    Source code
       1                 :            : /** @file protomset.h
       2                 :            :  * @brief ProtoMSet class
       3                 :            :  */
       4                 :            : /* Copyright (C) 2004,2007,2017,2018,2019 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #ifndef XAPIAN_INCLUDED_PROTOMSET_H
      22                 :            : #define XAPIAN_INCLUDED_PROTOMSET_H
      23                 :            : 
      24                 :            : #include "api/enquireinternal.h"
      25                 :            : #include "api/result.h"
      26                 :            : #include "collapser.h"
      27                 :            : #include "heap.h"
      28                 :            : #include "matchtimeout.h"
      29                 :            : #include "msetcmp.h"
      30                 :            : #include "omassert.h"
      31                 :            : #include "spymaster.h"
      32                 :            : #include "stdclamp.h"
      33                 :            : 
      34                 :            : #include <algorithm>
      35                 :            : 
      36                 :            : using Xapian::Internal::intrusive_ptr;
      37                 :            : 
      38                 :     298414 : class ProtoMSet {
      39                 :            :     /// Adapt MSetCmp to be usable with min_heap.
      40                 :            :     class MCmpAdaptor {
      41                 :            :         ProtoMSet* protomset;
      42                 :            : 
      43                 :            :       public:
      44                 :   17545146 :         explicit MCmpAdaptor(ProtoMSet* protomset_) : protomset(protomset_) {}
      45                 :            : 
      46                 :  228166599 :         bool operator()(Xapian::doccount a, Xapian::doccount b) const {
      47                 :  228166599 :             return protomset->mcmp(protomset->results[a],
      48                 :  456333198 :                                    protomset->results[b]);
      49                 :            :         }
      50                 :            :     };
      51                 :            : 
      52                 :            :     friend class MCmpAdaptor;
      53                 :            : 
      54                 :            :     /** Maximum size the ProtoMSet needs to grow to.
      55                 :            :      *
      56                 :            :      *  This is the maximum rank we care about.
      57                 :            :      */
      58                 :            :     Xapian::doccount max_size;
      59                 :            : 
      60                 :            :     Xapian::doccount check_at_least;
      61                 :            : 
      62                 :            :     Xapian::Enquire::Internal::sort_setting sort_by;
      63                 :            : 
      64                 :            :     MSetCmp mcmp;
      65                 :            : 
      66                 :            :     /** Minimum threshold on the weight.
      67                 :            :      *
      68                 :            :      *  If the primary result ordering is by decreasing relevance (i.e. @a
      69                 :            :      *  sort_by is REL or REL_VAL) then once the min_heap kicks in this
      70                 :            :      *  threshold is raised to the lowest weight in the proto-mset.
      71                 :            :      *
      72                 :            :      *  Enquire::set_cutoff() can also affect min_weight - an absolute
      73                 :            :      *  threshold determines the initial value; a percentage threshold raises
      74                 :            :      *  the threshold each time max_weight increases (unless it's already
      75                 :            :      *  higher than the value the percentage threshold results in).
      76                 :            :      */
      77                 :            :     double min_weight = 0.0;
      78                 :            : 
      79                 :            :     /** The highest document weight seen.
      80                 :            :      *
      81                 :            :      * This weight may not actually be present in @a results if we're not
      82                 :            :      * sorting primarily by relevance, or if min_weight > max_weight.
      83                 :            :      */
      84                 :            :     double max_weight = 0.0;
      85                 :            : 
      86                 :            :     bool min_weight_pending = false;
      87                 :            : 
      88                 :            :     /** Count of how many known matching documents have been processed so far.
      89                 :            :      *
      90                 :            :      *  Used to implement "check_at_least".
      91                 :            :      */
      92                 :            :     Xapian::doccount known_matching_docs = 0;
      93                 :            : 
      94                 :            :     /// The items in the proto-MSet.
      95                 :            :     vector<Result> results;
      96                 :            : 
      97                 :            :     /** A heap of offsets into @a results.
      98                 :            :      *
      99                 :            :      *  Created lazily once we actually need it.
     100                 :            :      */
     101                 :            :     vector<Xapian::doccount> min_heap;
     102                 :            : 
     103                 :            :     /// First entry wanted in MSet.
     104                 :            :     Xapian::doccount first;
     105                 :            : 
     106                 :            :     /** How many weighted leaf subqueries there are.
     107                 :            :      *
     108                 :            :      *  Used for scaling percentages when the highest weighted document doesn't
     109                 :            :      *  "match all terms".
     110                 :            :      */
     111                 :            :     Xapian::termcount total_subqs;
     112                 :            : 
     113                 :            :     /// The number of subqueries which matched to give max_weight.
     114                 :            :     Xapian::termcount max_weight_subqs_matched = 0;
     115                 :            : 
     116                 :            :     int percent_threshold;
     117                 :            : 
     118                 :            :     double percent_threshold_factor;
     119                 :            : 
     120                 :            :     double percent_scale = 0.0;
     121                 :            : 
     122                 :            :     PostListTree& pltree;
     123                 :            : 
     124                 :            :     Collapser collapser;
     125                 :            : 
     126                 :            :     double max_possible;
     127                 :            : 
     128                 :            :     bool stop_once_full;
     129                 :            : 
     130                 :            :     TimeOut timeout;
     131                 :            : 
     132                 :            :   public:
     133                 :     149207 :     ProtoMSet(Xapian::doccount first_,
     134                 :            :               Xapian::doccount max_items,
     135                 :            :               Xapian::doccount check_at_least_,
     136                 :            :               MSetCmp mcmp_,
     137                 :            :               Xapian::Enquire::Internal::sort_setting sort_by_,
     138                 :            :               Xapian::termcount total_subqs_,
     139                 :            :               PostListTree& pltree_,
     140                 :            :               Xapian::valueno collapse_key,
     141                 :            :               Xapian::doccount collapse_max,
     142                 :            :               int percent_threshold_,
     143                 :            :               double percent_threshold_factor_,
     144                 :            :               double max_possible_,
     145                 :            :               bool stop_once_full_,
     146                 :            :               double time_limit)
     147                 :     149207 :         : max_size(first_ + max_items),
     148                 :            :           check_at_least(check_at_least_),
     149                 :            :           sort_by(sort_by_),
     150                 :            :           mcmp(mcmp_),
     151                 :            :           first(first_),
     152                 :            :           total_subqs(total_subqs_),
     153                 :            :           percent_threshold(percent_threshold_),
     154                 :            :           percent_threshold_factor(percent_threshold_factor_),
     155                 :            :           pltree(pltree_),
     156                 :            :           collapser(collapse_key, collapse_max, results, mcmp),
     157                 :            :           max_possible(max_possible_),
     158                 :            :           stop_once_full(stop_once_full_),
     159         [ +  - ]:     149207 :           timeout(time_limit)
     160                 :            :     {
     161         [ +  - ]:     149207 :         results.reserve(max_size);
     162                 :     149207 :     }
     163                 :            : 
     164                 :            :     ProtoMSet(const ProtoMSet&) = delete;
     165                 :            : 
     166                 :            :     ProtoMSet& operator=(const ProtoMSet&) = delete;
     167                 :            : 
     168                 :            :     Collapser& get_collapser() { return collapser; }
     169                 :            : 
     170                 :     629208 :     bool full() const { return results.size() == max_size; }
     171                 :            : 
     172                 :   95620052 :     double get_min_weight() const { return min_weight; }
     173                 :            : 
     174                 :   34344932 :     void update_max_weight(double weight) {
     175         [ +  + ]:   34344932 :         if (weight <= max_weight)
     176                 :   33446943 :             return;
     177                 :            : 
     178                 :     897989 :         max_weight = weight;
     179                 :     897989 :         max_weight_subqs_matched = pltree.count_matching_subqs();
     180         [ +  + ]:     897989 :         if (percent_threshold) {
     181                 :      34402 :             set_new_min_weight(weight * percent_threshold_factor);
     182                 :            :         }
     183                 :            :     }
     184                 :            : 
     185                 :   17542466 :     bool checked_enough() {
     186         [ +  + ]:   17542466 :         if (known_matching_docs >= check_at_least) {
     187                 :   17539918 :             return true;
     188                 :            :         }
     189 [ +  - ][ +  + ]:       2548 :         if (known_matching_docs >= max_size && timeout.timed_out()) {
                 [ +  + ]
     190                 :          3 :             check_at_least = max_size;
     191                 :          3 :             return true;
     192                 :            :         }
     193                 :       2545 :         return false;
     194                 :            :     }
     195                 :            : 
     196                 :            :     /** Resolve a pending min_weight change.
     197                 :            :      *
     198                 :            :      *  Only called when there's a percentage weight cut-off.
     199                 :            :      */
     200                 :      17193 :     bool handle_min_weight_pending(bool finalising = false) {
     201                 :            :         // min_weight_pending shouldn't get set when unweighted.
     202                 :            :         Assert(sort_by != Xapian::Enquire::Internal::DOCID);
     203                 :      17193 :         min_weight_pending = false;
     204 [ -  + ][ #  # ]:      17193 :         bool weight_first = (sort_by == Xapian::Enquire::Internal::REL ||
     205                 :      17193 :                              sort_by == Xapian::Enquire::Internal::REL_VAL);
     206                 :      17193 :         double new_min_weight = HUGE_VAL;
     207                 :      17193 :         size_t j = 0;
     208                 :      17193 :         size_t min_elt = 0;
     209         [ +  + ]:      68679 :         for (size_t i = 0; i != results.size(); ++i) {
     210         [ +  + ]:      51486 :             if (results[i].get_weight() < min_weight) {
     211                 :       3997 :                 continue;
     212                 :            :             }
     213         [ +  + ]:      47489 :             if (i != j) {
     214                 :       3612 :                 results[j] = std::move(results[i]);
     215         [ +  + ]:       3612 :                 if (collapser) {
     216                 :       3605 :                     collapser.result_has_moved(i, j);
     217                 :            :                 }
     218                 :            :             }
     219 [ +  - ][ +  + ]:      47489 :             if (weight_first && results[j].get_weight() < new_min_weight) {
                 [ +  + ]
     220                 :      24795 :                 new_min_weight = results[j].get_weight();
     221                 :      24795 :                 min_elt = j;
     222                 :            :             }
     223                 :      47489 :             ++j;
     224                 :            :         }
     225         [ +  - ]:      17193 :         if (weight_first) {
     226         [ +  + ]:      17193 :             if (finalising) {
     227         [ +  + ]:      17178 :                 if (known_matching_docs >= check_at_least)
     228                 :       1204 :                     min_weight = new_min_weight;
     229                 :            :             } else {
     230         [ +  - ]:         15 :                 if (checked_enough())
     231                 :         15 :                     min_weight = new_min_weight;
     232                 :            :             }
     233                 :            :         }
     234         [ +  + ]:      17193 :         if (j != results.size()) {
     235         [ +  - ]:       3626 :             results.erase(results.begin() + j, results.end());
     236         [ +  + ]:       3626 :             if (!finalising) {
     237                 :          7 :                 return false;
     238                 :            :             }
     239                 :            :         }
     240 [ +  + ][ -  + ]:      17186 :         if (!finalising && min_elt != 0 && !collapser) {
         [ #  # ][ -  + ]
     241                 :            :             // Install the correct element at the tip of the heap, so
     242                 :            :             // that Heap::make() has less to do.  NB Breaks collapsing.
     243                 :          0 :             swap(results[0], results[min_elt]);
     244                 :            :         }
     245                 :      17193 :         return true;
     246                 :            :     }
     247                 :            : 
     248                 :      26552 :     bool early_reject(Result& new_item,
     249                 :            :                       bool calculated_weight,
     250                 :            :                       SpyMaster& spymaster,
     251                 :            :                       const Xapian::Document& doc) {
     252         [ +  + ]:      26552 :         if (min_heap.empty())
     253                 :      19072 :             return false;
     254                 :            : 
     255                 :            :         // We're sorting by value (in part at least), so compare the item
     256                 :            :         // against the lowest currently in the proto-mset.  If sort_by is VAL,
     257                 :            :         // then new_item.get_weight() won't be set yet, but that doesn't matter
     258                 :            :         // since it's not used by the sort function.
     259                 :       7480 :         Xapian::doccount worst_idx = min_heap.front();
     260         [ +  + ]:       7480 :         if (mcmp(new_item, results[worst_idx]))
     261                 :       3621 :             return false;
     262                 :            : 
     263                 :            :         // The candidate isn't good enough to make the proto-mset, but there
     264                 :            :         // are still things we may need to do with it.
     265                 :            : 
     266                 :            :         // If we're collapsing, we need to check if this would have been
     267                 :            :         // collapsed before incrementing known_matching_docs.
     268         [ +  + ]:       3859 :         if (!collapser) {
     269                 :       3761 :             ++known_matching_docs;
     270                 :            :             double weight =
     271         [ +  + ]:       3761 :                 calculated_weight ? new_item.get_weight() : pltree.get_weight();
     272                 :       3761 :             spymaster(doc, weight);
     273                 :       3761 :             update_max_weight(weight);
     274                 :       3761 :             return true;
     275                 :            :         }
     276                 :            : 
     277         [ -  + ]:         98 :         if (checked_enough()) {
     278                 :            :             // We've seen enough items so can drop this one.
     279                 :            :             double weight =
     280         [ #  # ]:          0 :                 calculated_weight ? new_item.get_weight() : pltree.get_weight();
     281                 :          0 :             update_max_weight(weight);
     282                 :          0 :             return true;
     283                 :            :         }
     284                 :            : 
     285                 :            :         // We can't drop the item because we need to test whether it would be
     286                 :            :         // collapsed.
     287                 :         98 :         return false;
     288                 :            :     }
     289                 :            : 
     290                 :            :     /** Process new_item.
     291                 :            :      *
     292                 :            :      *  Conceptually this is "add new_item", but taking into account
     293                 :            :      *  collapsing.
     294                 :            :      */
     295                 :   34341171 :     bool process(Result&& new_item,
     296                 :            :                  ValueStreamDocument& vsdoc) {
     297                 :   34341171 :         update_max_weight(new_item.get_weight());
     298                 :            : 
     299         [ +  + ]:   34341171 :         if (!collapser) {
     300                 :            :             // No collapsing, so just add the item.
     301                 :   34280479 :             add(std::move(new_item));
     302                 :            :         } else {
     303                 :      60692 :             auto res = collapser.check(new_item, vsdoc);
     304      [ +  +  + ]:      60692 :             switch (res) {
     305                 :            :                 case REJECT:
     306                 :       3991 :                     return true;
     307                 :            : 
     308                 :            :                 case REPLACE:
     309                 :            :                     // There was a previous item in the collapse tab so the
     310                 :            :                     // MSet can't be empty.
     311                 :            :                     Assert(!results.empty());
     312                 :            : 
     313                 :            :                     // This is one of the best collapse_max potential MSet
     314                 :            :                     // entries with this key which we've seen so far.  The
     315                 :            :                     // entry with this key which it displaced is still in the
     316                 :            :                     // proto-MSet so replace it.
     317                 :       1870 :                     replace(collapser.old_item, std::move(new_item));
     318                 :       1870 :                     return true;
     319                 :            : 
     320                 :            :                 default:
     321                 :      54831 :                     break;
     322                 :            :             }
     323                 :            : 
     324                 :      54831 :             auto elt = add(std::move(new_item));
     325 [ +  + ][ +  + ]:      54831 :             if (res != EMPTY && elt != Xapian::doccount(-1)) {
     326                 :      54647 :                 collapser.process(res, elt);
     327                 :            :             }
     328                 :            :         }
     329                 :            : 
     330         [ +  + ]:   34335310 :         if (stop_once_full) {
     331 [ +  + ][ +  - ]:     165397 :             if (full() && checked_enough()) {
                 [ +  + ]
     332                 :        115 :                 return false;
     333                 :            :             }
     334                 :            :         }
     335                 :            : 
     336                 :   34335195 :         return true;
     337                 :            :     }
     338                 :            : 
     339                 :            :     // Returns the new item's index, or Xapian::doccount(-1) if not added.
     340                 :   34335310 :     Xapian::doccount add(Result&& item) {
     341                 :   34335310 :         ++known_matching_docs;
     342                 :            : 
     343         [ -  + ]:   34335310 :         if (item.get_weight() < min_weight) {
     344                 :          0 :             return Xapian::doccount(-1);
     345                 :            :         }
     346                 :            : 
     347         [ -  + ]:   34335310 :         if (item.get_weight() > max_weight) {
     348                 :          0 :             update_max_weight(item.get_weight());
     349                 :            :         }
     350                 :            : 
     351         [ +  + ]:   34335310 :         if (results.size() < max_size) {
     352                 :            :             // We're still filling, or just about to become full.
     353                 :   16645960 :             results.push_back(std::move(item));
     354                 :            :             Assert(min_heap.empty());
     355                 :   16645960 :             return results.size() - 1;
     356                 :            :         }
     357                 :            : 
     358         [ +  + ]:   17689350 :         if (min_heap.empty()) {
     359                 :            :             // This breaks if we're collapsing because it moves elements around
     360                 :            :             // but can be used if we aren't (and could be for elements with
     361                 :            :             // no collapse key too - FIXME).
     362         [ +  + ]:     116066 :             if (min_weight_pending) {
     363         [ +  + ]:         15 :                 if (!handle_min_weight_pending()) {
     364                 :          7 :                     results.push_back(std::move(item));
     365                 :          7 :                     return results.size() - 1;
     366                 :            :                 }
     367                 :            :             }
     368                 :            : 
     369         [ +  + ]:     116059 :             if (results.size() == 0) {
     370                 :            :                 // E.g. get_mset(0, 0, 10);
     371                 :         35 :                 return Xapian::doccount(-1);
     372                 :            :             }
     373                 :     116024 :             min_heap.reserve(results.size());
     374         [ +  + ]:   16465932 :             for (Xapian::doccount i = 0; i != results.size(); ++i)
     375         [ +  - ]:   16349908 :                 min_heap.push_back(i);
     376         [ +  - ]:     116024 :             Heap::make(min_heap.begin(), min_heap.end(), MCmpAdaptor(this));
     377 [ +  + ][ +  + ]:     116024 :             if (sort_by == Xapian::Enquire::Internal::REL ||
     378                 :        399 :                 sort_by == Xapian::Enquire::Internal::REL_VAL) {
     379         [ +  + ]:     115709 :                 if (checked_enough()) {
     380                 :     116024 :                     min_weight = results[min_heap.front()].get_weight();
     381                 :            :                 }
     382                 :            :             }
     383                 :            :         }
     384                 :            : 
     385                 :   17689308 :         Xapian::doccount worst_idx = min_heap.front();
     386         [ +  + ]:   17689308 :         if (!mcmp(item, results[worst_idx])) {
     387                 :            :             // The new item is less than what we already had.
     388                 :     260230 :             return Xapian::doccount(-1);
     389                 :            :         }
     390                 :            : 
     391                 :   17429078 :         results[worst_idx] = std::move(item);
     392         [ +  - ]:   17429078 :         Heap::replace(min_heap.begin(), min_heap.end(), MCmpAdaptor(this));
     393 [ +  + ][ +  + ]:   17429078 :         if (sort_by == Xapian::Enquire::Internal::REL ||
     394                 :       3947 :             sort_by == Xapian::Enquire::Internal::REL_VAL) {
     395         [ +  + ]:   17426529 :             if (checked_enough()) {
     396                 :   17424229 :                 min_weight = results[min_heap.front()].get_weight();
     397                 :            :             }
     398                 :            :         }
     399                 :   34335310 :         return worst_idx;
     400                 :            :     }
     401                 :            : 
     402                 :       1870 :     void replace(Xapian::doccount old_item, Result&& b) {
     403         [ +  - ]:       1870 :         results[old_item] = std::move(b);
     404         [ +  + ]:       1870 :         if (min_heap.empty())
     405                 :       1870 :             return;
     406                 :            : 
     407                 :            :         // We need to find the entry in min_heap corresponding to old_item.
     408                 :            :         // The simplest way is just to linear-scan for it, and that's actually
     409                 :            :         // fairly efficient as we're just searching for an integer in a
     410                 :            :         // vector of integers.  The heap structure means that the lowest ranked
     411                 :            :         // entry is first and lower ranked entries will tend to be nearer the
     412                 :            :         // start, so intuitively scanning forwards for an entry which we're
     413                 :            :         // removing because we found a higher ranking one seems sensible, but
     414                 :            :         // I've not actually profiled this.
     415         [ +  - ]:         44 :         auto it = std::find(min_heap.begin(), min_heap.end(), old_item);
     416         [ -  + ]:         44 :         if (rare(it == min_heap.end())) {
     417                 :            :             // min_heap should contain all indices of results.
     418                 :            :             Assert(false);
     419                 :          0 :             return;
     420                 :            :         }
     421                 :            : 
     422                 :            :         // siftdown() here is correct (because it's on a min-heap).
     423         [ +  - ]:         44 :         Heap::siftdown(min_heap.begin(), min_heap.end(), it, MCmpAdaptor(this));
     424                 :            :     }
     425                 :            : 
     426                 :     200816 :     void set_new_min_weight(double min_wt) {
     427         [ +  + ]:     200816 :         if (min_wt <= min_weight)
     428                 :     163450 :             return;
     429                 :            : 
     430                 :      37366 :         min_weight = min_wt;
     431                 :            : 
     432         [ +  + ]:      37366 :         if (results.empty()) {
     433                 :            :             // This method gets called before we start matching to set the
     434                 :            :             // fixed weight_threshold threshold.
     435                 :      17226 :             return;
     436                 :            :         }
     437                 :            : 
     438                 :            : #if 0
     439                 :            :         // FIXME: Is this possible?  set_new_min_weight() from a percentage
     440                 :            :         // threshold can't do this...
     441                 :            :         if (min_wt > max_weight) {
     442                 :            :             // The new threshold invalidates all current entries.
     443                 :            :             results.resize(0);
     444                 :            :             min_heap.resize(0);
     445                 :            :             return;
     446                 :            :         }
     447                 :            : #endif
     448                 :            : 
     449         [ -  + ]:      20140 :         if (!min_heap.empty()) {
     450                 :            :             // If sorting primarily by weight, we could pop the heap while the
     451                 :            :             // tip's weight is < min_wt, but each pop needs 2*log(n)
     452                 :            :             // comparisons, and then pushing replacements for each of those
     453                 :            :             // items needs log(n) comparisons.
     454                 :            :             //
     455                 :            :             // Instead we just discard the heap - if we need to rebuild it,
     456                 :            :             // that'll require 3*n comparisons.  The break even is about 3
     457                 :            :             // discarded items for n=10 or about 5 for n=100, but we may never
     458                 :            :             // need to rebuild the heap.
     459                 :          0 :             min_heap.clear();
     460                 :            :         }
     461                 :            : 
     462                 :            :         // Note that we need to check items against min_weight at some point.
     463                 :      20140 :         min_weight_pending = true;
     464                 :            :     }
     465                 :            : 
     466                 :     149207 :     void finalise_percentages() {
     467 [ +  + ][ +  + ]:     149207 :         if (results.empty() || max_weight == 0.0)
                 [ +  + ]
     468                 :      11066 :             return;
     469                 :            : 
     470                 :     138141 :         percent_scale = max_weight_subqs_matched / double(total_subqs);
     471                 :     138141 :         percent_scale /= max_weight;
     472                 :            :         Assert(percent_scale > 0);
     473         [ +  + ]:     138141 :         if (!percent_threshold) {
     474                 :     120934 :             return;
     475                 :            :         }
     476                 :            : 
     477                 :            :         // Truncate the results if necessary.
     478                 :      17207 :         set_new_min_weight(percent_threshold_factor / percent_scale);
     479         [ +  + ]:      17207 :         if (min_weight_pending) {
     480                 :      17178 :             handle_min_weight_pending(true);
     481                 :            :         }
     482                 :            :     }
     483                 :            : 
     484                 :     149207 :     Xapian::MSet finalise(const Xapian::MatchDecider* mdecider,
     485                 :            :                           Xapian::doccount matches_lower_bound,
     486                 :            :                           Xapian::doccount matches_estimated,
     487                 :            :                           Xapian::doccount matches_upper_bound) {
     488                 :     149207 :         finalise_percentages();
     489                 :            : 
     490                 :            :         AssertRel(matches_estimated, >=, matches_lower_bound);
     491                 :            :         AssertRel(matches_estimated, <=, matches_upper_bound);
     492                 :            : 
     493                 :     149207 :         Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
     494                 :     149207 :         Xapian::doccount uncollapsed_estimated = matches_estimated;
     495                 :     149207 :         Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
     496                 :            : 
     497         [ +  + ]:     149207 :         if (!full()) {
     498                 :            :             // We didn't get all the results requested, so we know that we've
     499                 :            :             // got all there are, and the bounds and estimate are all equal to
     500                 :            :             // that number.
     501                 :      30675 :             matches_lower_bound = results.size();
     502                 :      30675 :             matches_estimated = matches_lower_bound;
     503                 :      30675 :             matches_upper_bound = matches_lower_bound;
     504                 :            : 
     505                 :            :             // And that should equal known_matching_docs, unless a percentage
     506                 :            :             // threshold caused some matches to be excluded.
     507                 :      30675 :             if (!percent_threshold) {
     508                 :            :                 AssertEq(matches_estimated, known_matching_docs);
     509                 :            :             } else {
     510                 :            :                 AssertRel(matches_estimated, <=, known_matching_docs);
     511                 :            :             }
     512 [ +  + ][ +  + ]:     118532 :         } else if (!collapser && known_matching_docs < check_at_least) {
                 [ +  + ]
     513                 :            :             // Similar to the above, but based on known_matching_docs.
     514                 :        161 :             matches_lower_bound = known_matching_docs;
     515                 :        161 :             matches_estimated = matches_lower_bound;
     516                 :        161 :             matches_upper_bound = matches_lower_bound;
     517                 :            :         } else {
     518                 :            :             // We can end up scaling the estimate more than once, so collect
     519                 :            :             // the scale factors and apply them in one go to avoid rounding
     520                 :            :             // more than once.
     521                 :     118371 :             double estimate_scale = 1.0;
     522                 :     118371 :             double unique_rate = 1.0;
     523                 :            : 
     524         [ +  + ]:     118371 :             if (collapser) {
     525                 :       1554 :                 matches_lower_bound = collapser.get_matches_lower_bound();
     526                 :            : 
     527                 :            :                 Xapian::doccount docs_considered =
     528                 :       1554 :                     collapser.get_docs_considered();
     529                 :       1554 :                 Xapian::doccount dups_ignored = collapser.get_dups_ignored();
     530         [ +  - ]:       1554 :                 if (docs_considered > 0) {
     531                 :            :                     // Scale the estimate by the rate at which we've been
     532                 :            :                     // finding unique documents.
     533                 :       1554 :                     double unique = double(docs_considered - dups_ignored);
     534                 :       1554 :                     unique_rate = unique / double(docs_considered);
     535                 :            :                 }
     536                 :            : 
     537                 :            :                 // We can safely reduce the upper bound by the number of
     538                 :            :                 // duplicates we've ignored.
     539                 :       1554 :                 matches_upper_bound -= dups_ignored;
     540                 :            :             }
     541                 :            : 
     542         [ +  + ]:     118371 :             if (mdecider) {
     543 [ +  + ][ +  + ]:         20 :                 if (!percent_threshold && !collapser) {
                 [ +  + ]
     544         [ +  - ]:          5 :                     if (known_matching_docs > matches_lower_bound) {
     545                 :            :                         // We're not collapsing or doing a percentage
     546                 :            :                         // threshold, so known_matching_docs is a lower bound
     547                 :            :                         // on the total number of matches.
     548                 :          5 :                         matches_lower_bound = known_matching_docs;
     549                 :            :                     }
     550                 :            :                 }
     551                 :            : 
     552                 :         20 :                 Xapian::doccount decider_denied = mdecider->docs_denied_;
     553                 :            :                 Xapian::doccount decider_considered =
     554                 :         20 :                     mdecider->docs_allowed_ + mdecider->docs_denied_;
     555                 :            : 
     556                 :            :                 // Scale the estimate by the rate at which the MatchDecider has
     557                 :            :                 // been accepting documents.
     558         [ +  - ]:         20 :                 if (decider_considered > 0) {
     559                 :         20 :                     double accept = double(decider_considered - decider_denied);
     560                 :         20 :                     double accept_rate = accept / double(decider_considered);
     561                 :         20 :                     estimate_scale *= accept_rate;
     562                 :            :                 }
     563                 :            : 
     564                 :            :                 // If a document is denied by the MatchDecider, it can't be
     565                 :            :                 // found to be a duplicate, so it is safe to also reduce the
     566                 :            :                 // upper bound by the number of documents denied by the
     567                 :            :                 // MatchDecider.
     568                 :         20 :                 matches_upper_bound -= decider_denied;
     569         [ +  + ]:         20 :                 if (collapser) uncollapsed_upper_bound -= decider_denied;
     570                 :            :             }
     571                 :            : 
     572         [ +  + ]:     118371 :             if (percent_threshold) {
     573                 :            :                 // Scale the estimate assuming that document weights are evenly
     574                 :            :                 // distributed from 0 to the maximum weight seen.
     575                 :       1221 :                 estimate_scale *= (1.0 - percent_threshold_factor);
     576                 :            : 
     577                 :            :                 // This is all we can be sure of without additional work.
     578                 :       1221 :                 matches_lower_bound = results.size();
     579                 :            : 
     580         [ +  - ]:       1221 :                 if (collapser) {
     581                 :       1221 :                     uncollapsed_lower_bound = matches_lower_bound;
     582                 :            :                 }
     583                 :            :             }
     584                 :            : 
     585 [ +  + ][ +  + ]:     118371 :             if (collapser && estimate_scale != 1.0) {
                 [ +  + ]
     586                 :            :                 uncollapsed_estimated =
     587                 :       1226 :                     Xapian::doccount(uncollapsed_estimated * estimate_scale +
     588                 :       1226 :                                      0.5);
     589                 :            :             }
     590                 :            : 
     591                 :     118371 :             estimate_scale *= unique_rate;
     592                 :            : 
     593         [ +  + ]:     118371 :             if (estimate_scale != 1.0) {
     594                 :            :                 matches_estimated =
     595                 :       1251 :                     Xapian::doccount(matches_estimated * estimate_scale + 0.5);
     596         [ +  + ]:       1251 :                 if (matches_estimated < matches_lower_bound)
     597                 :       1211 :                     matches_estimated = matches_lower_bound;
     598                 :            :             }
     599                 :            : 
     600 [ +  + ][ +  + ]:     118371 :             if (collapser || mdecider) {
                 [ +  + ]
     601                 :            :                 // Clamp the estimate the range given by the bounds.
     602                 :            :                 AssertRel(matches_lower_bound, <=, matches_upper_bound);
     603                 :            :                 matches_estimated = STD_CLAMP(matches_estimated,
     604                 :            :                                               matches_lower_bound,
     605                 :       1559 :                                               matches_upper_bound);
     606         [ +  - ]:     116812 :             } else if (!percent_threshold) {
     607                 :            :                 AssertRel(known_matching_docs, <=, matches_upper_bound);
     608         [ +  + ]:     116812 :                 if (known_matching_docs > matches_lower_bound)
     609                 :         37 :                     matches_lower_bound = known_matching_docs;
     610         [ +  + ]:     116812 :                 if (known_matching_docs > matches_estimated)
     611                 :         21 :                     matches_estimated = known_matching_docs;
     612                 :            :             }
     613                 :            : 
     614 [ +  + ][ +  + ]:     118371 :             if (collapser && !mdecider && !percent_threshold) {
         [ +  + ][ +  + ]
     615                 :            :                 AssertRel(known_matching_docs, <=, uncollapsed_upper_bound);
     616         [ -  + ]:        328 :                 if (known_matching_docs > uncollapsed_lower_bound)
     617                 :          0 :                     uncollapsed_lower_bound = known_matching_docs;
     618                 :            :             }
     619                 :            :         }
     620                 :            : 
     621 [ +  + ][ +  + ]:     149207 :         if (collapser && matches_lower_bound > uncollapsed_lower_bound) {
                 [ +  + ]
     622                 :            :             // Clamp the uncollapsed bound to be at least the collapsed one.
     623                 :         15 :             uncollapsed_lower_bound = matches_lower_bound;
     624                 :            :         }
     625                 :            : 
     626         [ +  + ]:     149207 :         if (collapser) {
     627                 :            :             // Clamp the estimate to lie within the known bounds.
     628         [ +  + ]:      17781 :             if (uncollapsed_estimated < uncollapsed_lower_bound) {
     629                 :       1211 :                 uncollapsed_estimated = uncollapsed_lower_bound;
     630         [ -  + ]:      16570 :             } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
     631                 :          0 :                 uncollapsed_estimated = uncollapsed_upper_bound;
     632                 :            :             }
     633                 :            :         } else {
     634                 :            :             // When not collapsing the uncollapsed bounds are just the same.
     635                 :     131426 :             uncollapsed_lower_bound = matches_lower_bound;
     636                 :     131426 :             uncollapsed_estimated = matches_estimated;
     637                 :     131426 :             uncollapsed_upper_bound = matches_upper_bound;
     638                 :            :         }
     639                 :            : 
     640                 :            :         // FIXME: Profile using min_heap here (when it's been created) to
     641                 :            :         // handle "first" and perform the sort.
     642         [ +  + ]:     149207 :         if (first != 0) {
     643         [ +  + ]:     114127 :             if (first > results.size()) {
     644                 :         13 :                 results.clear();
     645                 :            :             } else {
     646                 :            :                 // We perform nth_element() on reverse iterators so that the
     647                 :            :                 // unwanted elements end up at the end of items, which means
     648                 :            :                 // that the call to erase() to remove them doesn't have to copy
     649                 :            :                 // any elements.
     650         [ +  - ]:     114114 :                 auto nth = results.rbegin() + first;
     651         [ +  - ]:     114114 :                 std::nth_element(results.rbegin(), nth, results.rend(), mcmp);
     652                 :            :                 // Discard the unwanted elements.
     653         [ +  - ]:     114127 :                 results.erase(results.end() - first, results.end());
     654                 :            :             }
     655                 :            :         }
     656                 :            : 
     657                 :     149207 :         std::sort(results.begin(), results.end(), mcmp);
     658                 :            : 
     659                 :     149207 :         collapser.finalise(min_weight, percent_threshold);
     660                 :            : 
     661                 :            :         // The estimates should lie between the bounds.
     662                 :            :         AssertRel(matches_lower_bound, <=, matches_estimated);
     663                 :            :         AssertRel(matches_estimated, <=, matches_upper_bound);
     664                 :            :         AssertRel(uncollapsed_lower_bound, <=, uncollapsed_estimated);
     665                 :            :         AssertRel(uncollapsed_estimated, <=, uncollapsed_upper_bound);
     666                 :            : 
     667                 :            :         // Collapsing should only reduce the bounds and estimate.
     668                 :            :         AssertRel(matches_lower_bound, <=, uncollapsed_lower_bound);
     669                 :            :         AssertRel(matches_estimated, <=, uncollapsed_estimated);
     670                 :            :         AssertRel(matches_upper_bound, <=, uncollapsed_upper_bound);
     671                 :            : 
     672                 :            :         return Xapian::MSet(new Xapian::MSet::Internal(first,
     673                 :            :                                                        matches_upper_bound,
     674                 :            :                                                        matches_lower_bound,
     675                 :            :                                                        matches_estimated,
     676                 :            :                                                        uncollapsed_upper_bound,
     677                 :            :                                                        uncollapsed_lower_bound,
     678                 :            :                                                        uncollapsed_estimated,
     679                 :            :                                                        max_possible,
     680                 :            :                                                        max_weight,
     681                 :     149207 :                                                        std::move(results),
     682         [ +  - ]:     149207 :                                                        percent_scale * 100.0));
     683                 :            :     }
     684                 :            : };
     685                 :            : 
     686                 :            : #endif // XAPIAN_INCLUDED_PROTOMSET_H

Generated by: LCOV version 1.11