LCOV - code coverage report
Current view: top level - matcher - collapser.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 29 61 47.5 %
Date: 2019-06-30 05:20:33 Functions: 15 20 75.0 %
Branches: 10 42 23.8 %

           Branch data     Line data    Source code
       1                 :            : /** @file collapser.h
       2                 :            :  * @brief Collapse documents with the same collapse key during the match.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2009,2011,2017 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #ifndef XAPIAN_INCLUDED_COLLAPSER_H
      22                 :            : #define XAPIAN_INCLUDED_COLLAPSER_H
      23                 :            : 
      24                 :            : #include "backends/documentinternal.h"
      25                 :            : #include "msetcmp.h"
      26                 :            : #include "omassert.h"
      27                 :            : #include "api/postlist.h"
      28                 :            : #include "api/result.h"
      29                 :            : 
      30                 :            : #include <unordered_map>
      31                 :            : #include <vector>
      32                 :            : 
      33                 :            : /// Enumeration reporting how a result will be handled by the Collapser.
      34                 :            : typedef enum {
      35                 :            :     EMPTY,
      36                 :            :     NEW,
      37                 :            :     ADD,
      38                 :            :     REJECT,
      39                 :            :     REPLACE
      40                 :            : } collapse_result;
      41                 :            : 
      42                 :            : /// Class tracking information for a given value of the collapse key.
      43                 :     368328 : class CollapseData {
      44                 :            :     /** Currently kept MSet entries for this value of the collapse key.
      45                 :            :      *
      46                 :            :      *  If collapse_max > 1, then this is a min-heap once collapse_count > 0.
      47                 :            :      *
      48                 :            :      *  The first member of the pair is the index into proto_mset.results
      49                 :            :      *  and the second is the docid of the entry (used to detect if the
      50                 :            :      *  entry in proto_mset.results we were referring to has been dropped).
      51                 :            :      *
      52                 :            :      *  FIXME: We expect collapse_max to be small, so perhaps we should
      53                 :            :      *  preallocate space for that many entries and/or allocate space in
      54                 :            :      *  larger blocks to divvy up?
      55                 :            :      */
      56                 :            :     std::vector<std::pair<Xapian::doccount, Xapian::docid>> items;
      57                 :            : 
      58                 :            :     /// The highest weight of a document we've rejected.
      59                 :            :     double next_best_weight;
      60                 :            : 
      61                 :            :     /// The number of documents we've rejected.
      62                 :            :     Xapian::doccount collapse_count;
      63                 :            : 
      64                 :            :   public:
      65                 :            :     /// Construct with the given item.
      66                 :      61388 :     CollapseData(Xapian::doccount item, Xapian::docid did)
      67 [ +  - ][ +  - ]:      61388 :         : items(1, { item, did }), next_best_weight(0), collapse_count(0) {
      68                 :      61388 :     }
      69                 :            : 
      70                 :            :     /** Check a new result with this collapse key value.
      71                 :            :      *
      72                 :            :      *  If this method determines the action to take is ADD or REPLACE, then
      73                 :            :      *  the proto-mset should be updated and then add_item() called to complete
      74                 :            :      *  the update of the CollapseData (if the result doesn't actually get
      75                 :            :      *  added, then it's OK not to follow up with a call to add_item()).
      76                 :            :      *
      77                 :            :      *  @param results          The results so far.
      78                 :            :      *  @param result           The new result.
      79                 :            :      *  @param collapse_max     Max no. of items for each collapse key value.
      80                 :            :      *  @param mcmp             Result comparison functor.
      81                 :            :      *  @param[out] old_item    Item to be replaced (when REPLACE is returned).
      82                 :            :      *
      83                 :            :      *  @return How to handle @a result: ADD, REJECT or REPLACE.
      84                 :            :      */
      85                 :            :     collapse_result check_item(const std::vector<Result>& results,
      86                 :            :                                const Result& result,
      87                 :            :                                Xapian::doccount collapse_max,
      88                 :            :                                MSetCmp mcmp,
      89                 :            :                                Xapian::doccount& old_item);
      90                 :            : 
      91                 :            :     /** Set item after constructing with a placeholder.
      92                 :            :      *
      93                 :            :      *  @param item             The new item (index into results).
      94                 :            :      */
      95                 :            :     void set_item(Xapian::doccount item);
      96                 :            : 
      97                 :            :     /** Complete update of new result with this collapse key value.
      98                 :            :      *
      99                 :            :      *  @param results          The results so far.
     100                 :            :      *  @param item             The new item (index into results).
     101                 :            :      *  @param collapse_max     Max no. of items for each collapse key value.
     102                 :            :      *  @param mcmp             Result comparison functor.
     103                 :            :      */
     104                 :            :     void add_item(const std::vector<Result>& results,
     105                 :            :                   Xapian::doccount item,
     106                 :            :                   Xapian::doccount collapse_max,
     107                 :            :                   MSetCmp mcmp);
     108                 :            : 
     109                 :            :     /** Process relocation of entry in results.
     110                 :            :      *
     111                 :            :      *  @param from     The old item (index into results).
     112                 :            :      *  @param to       The new item (index into results).
     113                 :            :      */
     114                 :       3605 :     void result_has_moved(Xapian::doccount from, Xapian::doccount to) {
     115         [ +  - ]:       5453 :         for (auto&& item : items) {
     116         [ +  + ]:       5453 :             if (item.first == from) {
     117                 :       3605 :                 item.first = to;
     118                 :       3605 :                 return;
     119                 :            :             }
     120                 :            :         }
     121                 :            :         // The entry ought to be present.
     122                 :            :         Assert(false);
     123                 :            :     }
     124                 :            : 
     125                 :            :     /// The highest weight of a document we've rejected.
     126                 :      94808 :     double get_next_best_weight() const { return next_best_weight; }
     127                 :            : 
     128                 :            :     /// The number of documents we've rejected.
     129                 :       6512 :     Xapian::doccount get_collapse_count() const { return collapse_count; }
     130                 :            : };
     131                 :            : 
     132                 :            : /// The Collapser class tracks collapse keys and the documents they match.
     133                 :     299160 : class Collapser {
     134                 :            :     /// Map from collapse key values to the items we're keeping for them.
     135                 :            :     std::unordered_map<std::string, CollapseData> table;
     136                 :            : 
     137                 :            :     /// How many items we're currently keeping in @a table.
     138                 :            :     Xapian::doccount entry_count = 0;
     139                 :            : 
     140                 :            :     /** How many documents have we seen without a collapse key?
     141                 :            :      *
     142                 :            :      *  We use this statistic to improve matches_lower_bound.
     143                 :            :      */
     144                 :            :     Xapian::doccount no_collapse_key = 0;
     145                 :            : 
     146                 :            :     /** How many documents with duplicate collapse keys we have ignored.
     147                 :            :      *
     148                 :            :      *  We use this statistic to improve matches_estimated (by considering
     149                 :            :      *  the rate of collapsing) and matches_upper_bound.
     150                 :            :      */
     151                 :            :     Xapian::doccount dups_ignored = 0;
     152                 :            : 
     153                 :            :     /** How many documents we've considered for collapsing.
     154                 :            :      *
     155                 :            :      *  We use this statistic to improve matches_estimated (by considering
     156                 :            :      *  the rate of collapsing).
     157                 :            :      */
     158                 :            :     Xapian::doccount docs_considered = 0;
     159                 :            : 
     160                 :            :     /** The value slot we're getting collapse keys from. */
     161                 :            :     Xapian::valueno slot;
     162                 :            : 
     163                 :            :     /** The maximum number of items to keep for each collapse key value. */
     164                 :            :     Xapian::doccount collapse_max;
     165                 :            : 
     166                 :            :     std::vector<Result>& results;
     167                 :            : 
     168                 :            :     MSetCmp mcmp;
     169                 :            : 
     170                 :            :     /** Pointer to CollapseData when NEW or ADD is in progress. */
     171                 :            :     CollapseData* ptr = NULL;
     172                 :            : 
     173                 :            :     /// Adapt @a mcmp to be usable with min_heap.
     174                 :            :     bool operator()(Xapian::doccount a, Xapian::doccount b) const {
     175                 :            :         return mcmp(results[a], results[b]);
     176                 :            :     }
     177                 :            : 
     178                 :            :   public:
     179                 :            :     /// Replaced item when REPLACE is returned by @a collapse().
     180                 :            :     Xapian::doccount old_item = 0;
     181                 :            : 
     182                 :     149580 :     Collapser(Xapian::valueno slot_,
     183                 :            :               Xapian::doccount collapse_max_,
     184                 :            :               std::vector<Result>& results_,
     185                 :            :               MSetCmp mcmp_)
     186                 :            :         : slot(slot_),
     187                 :            :           collapse_max(collapse_max_),
     188                 :            :           results(results_),
     189         [ +  - ]:     149580 :           mcmp(mcmp_) { }
     190                 :            : 
     191                 :            :     /// Return true if collapsing is active for this match.
     192                 :   70500888 :     operator bool() const { return collapse_max != 0; }
     193                 :            : 
     194                 :            :     /** Check a new result.
     195                 :            :      *
     196                 :            :      *  If this method determines the action to take is NEW or ADD then the
     197                 :            :      *  proto-mset should be updated and then process() called to complete the
     198                 :            :      *  update (if the result doesn't actually get added, then it's OK not
     199                 :            :      *  to follow up with a call to process()).
     200                 :            :      *
     201                 :            :      *  @param result   The new result.
     202                 :            :      *  @param vsdoc    Document for getting values.
     203                 :            :      *
     204                 :            :      *  @return How to handle @a result: EMPTY, NEW, ADD, REJECT or REPLACE.
     205                 :            :      */
     206                 :            :     collapse_result check(Result& result,
     207                 :            :                           Xapian::Document::Internal & vsdoc);
     208                 :            : 
     209                 :            :     /** Handle a new Result.
     210                 :            :      *
     211                 :            :      *  @param action   The collapse_result returned by check().
     212                 :            :      *  @param item     The new item (index into results).
     213                 :            :      */
     214                 :            :     void process(collapse_result action, Xapian::doccount item);
     215                 :            : 
     216                 :            :     /** Process relocation of entry in results.
     217                 :            :      *
     218                 :            :      *  @param from             The old item (index into results).
     219                 :            :      *  @param to               The new item (index into results).
     220                 :            :      */
     221                 :       3605 :     void result_has_moved(Xapian::doccount from, Xapian::doccount to) {
     222                 :       3605 :         const string& collapse_key = results[to].get_collapse_key();
     223         [ -  + ]:       3605 :         if (collapse_key.empty()) {
     224                 :       3605 :             return;
     225                 :            :         }
     226         [ +  - ]:       3605 :         auto it = table.find(collapse_key);
     227         [ -  + ]:       3605 :         if (rare(it == table.end())) {
     228                 :            :             // The entry ought to be present.
     229                 :            :             Assert(false);
     230                 :          0 :             return;
     231                 :            :         }
     232                 :            : 
     233                 :       3605 :         CollapseData& collapse_data = it->second;
     234                 :       3605 :         collapse_data.result_has_moved(from, to);
     235                 :            :     }
     236                 :            : 
     237                 :            :     Xapian::doccount get_collapse_count(const std::string & collapse_key,
     238                 :            :                                         int percent_threshold,
     239                 :            :                                         double min_weight) const;
     240                 :            : 
     241                 :       3120 :     Xapian::doccount get_docs_considered() const { return docs_considered; }
     242                 :            : 
     243                 :       3120 :     Xapian::doccount get_dups_ignored() const { return dups_ignored; }
     244                 :            : 
     245                 :            :     Xapian::doccount get_entries() const { return entry_count; }
     246                 :            : 
     247                 :            :     Xapian::doccount get_matches_lower_bound() const;
     248                 :            : 
     249                 :            :     void finalise(double min_weight, int percent_threshold);
     250                 :            : };
     251                 :            : 
     252                 :            : /** Simpler version of Collapser used when merging MSet objects.
     253                 :            :  *
     254                 :            :  *  We merge results in descending rank order, so collapsing is much simpler
     255                 :            :  *  than during the match - we just need to discard documents if we've already
     256                 :            :  *  seen collapse_max with the same key.
     257                 :            :  */
     258                 :         60 : class CollapserLite {
     259                 :            :     /// Map from collapse key values to collapse counts.
     260                 :            :     std::unordered_map<std::string, Xapian::doccount> table;
     261                 :            : 
     262                 :            :     /// How many items we're currently keeping in @a table.
     263                 :            :     Xapian::doccount entry_count = 0;
     264                 :            : 
     265                 :            :     /** How many documents have we seen without a collapse key?
     266                 :            :      *
     267                 :            :      *  We use this statistic to improve matches_lower_bound.
     268                 :            :      */
     269                 :            :     Xapian::doccount no_collapse_key = 0;
     270                 :            : 
     271                 :            :     /** How many documents with duplicate collapse keys we have ignored.
     272                 :            :      *
     273                 :            :      *  We use this statistic to improve matches_estimated (by considering
     274                 :            :      *  the rate of collapsing) and matches_upper_bound.
     275                 :            :      */
     276                 :            :     Xapian::doccount dups_ignored = 0;
     277                 :            : 
     278                 :            :     /** How many documents we've considered for collapsing.
     279                 :            :      *
     280                 :            :      *  We use this statistic to improve matches_estimated (by considering
     281                 :            :      *  the rate of collapsing).
     282                 :            :      */
     283                 :            :     Xapian::doccount docs_considered = 0;
     284                 :            : 
     285                 :            :     /** The maximum number of items to keep for each collapse key value. */
     286                 :            :     Xapian::doccount collapse_max;
     287                 :            : 
     288                 :            :   public:
     289                 :         30 :     CollapserLite(Xapian::doccount collapse_max_)
     290         [ +  - ]:         30 :         : collapse_max(collapse_max_) {}
     291                 :            : 
     292                 :            :     /// Return true if collapsing is active for this match.
     293                 :        316 :     operator bool() const { return collapse_max != 0; }
     294                 :            : 
     295                 :            :     /** Try to add a new key.
     296                 :            :      *
     297                 :            :      *  @return true if accepted; false if rejected.
     298                 :            :      */
     299                 :          0 :     bool add(const std::string& key) {
     300                 :          0 :         ++docs_considered;
     301                 :            : 
     302         [ #  # ]:          0 :         if (key.empty()) {
     303                 :          0 :             ++no_collapse_key;
     304                 :          0 :             return true;
     305                 :            :         }
     306                 :            : 
     307         [ #  # ]:          0 :         auto r = table.emplace(key, 1);
     308         [ #  # ]:          0 :         if (r.second) {
     309                 :            :             // New entry, set to 1.
     310         [ #  # ]:          0 :         } else if (r.first->second == collapse_max) {
     311                 :            :             // Already seen collapse_max with this key so reject.
     312                 :          0 :             ++dups_ignored;
     313                 :          0 :             return false;
     314                 :            :         } else {
     315                 :            :             // Increment count.
     316                 :          0 :             ++r.first->second;
     317                 :            :         }
     318                 :          0 :         ++entry_count;
     319                 :          0 :         return true;
     320                 :            :     }
     321                 :            : 
     322                 :          0 :     Xapian::doccount get_docs_considered() const { return docs_considered; }
     323                 :            : 
     324                 :          0 :     Xapian::doccount get_dups_ignored() const { return dups_ignored; }
     325                 :            : 
     326                 :            :     Xapian::doccount get_entries() const { return entry_count; }
     327                 :            : 
     328                 :          0 :     Xapian::doccount get_matches_lower_bound() const {
     329                 :          0 :         return no_collapse_key + entry_count;
     330                 :            :     }
     331                 :            : 
     332                 :          0 :     void finalise(std::vector<Result>& results, int percent_threshold) {
     333 [ #  # ][ #  # ]:          0 :         if (table.empty() || results.empty())
                 [ #  # ]
     334                 :          0 :             return;
     335                 :            : 
     336                 :            :         // We need to fill in collapse_count values in results using the
     337                 :            :         // information stored in table.
     338                 :          0 :         Xapian::doccount todo = entry_count;
     339         [ #  # ]:          0 :         for (Result& result : results) {
     340                 :          0 :             const string& key = result.get_collapse_key();
     341         [ #  # ]:          0 :             if (key.empty())
     342                 :          0 :                 continue;
     343                 :            : 
     344                 :            :             // Adjust collapse_count.
     345         [ #  # ]:          0 :             if (percent_threshold) {
     346                 :            :                 // FIXME: We can probably do better here.
     347                 :          0 :                 result.set_collapse_count(1);
     348                 :            :             } else {
     349         [ #  # ]:          0 :                 auto c = result.get_collapse_count() + table[key];
     350                 :          0 :                 result.set_collapse_count(c);
     351                 :            :             }
     352                 :            : 
     353         [ #  # ]:          0 :             if (--todo == 0) {
     354                 :            :                 // Terminate early if we've handled all non-empty entries.
     355                 :          0 :                 break;
     356                 :            :             }
     357                 :            :         }
     358                 :            :     }
     359                 :            : };
     360                 :            : 
     361                 :            : #endif // XAPIAN_INCLUDED_COLLAPSER_H

Generated by: LCOV version 1.11