LCOV - code coverage report
Current view: top level - matcher - collapser.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7028d852e609 Lines: 78 83 94.0 %
Date: 2019-02-17 14:59:59 Functions: 10 11 90.9 %
Branches: 47 59 79.7 %

           Branch data     Line data    Source code
       1                 :            : /** @file collapser.cc
       2                 :            :  * @brief Collapse documents with the same collapse key during the match.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2009,2011,2017 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include "collapser.h"
      24                 :            : 
      25                 :            : #include "heap.h"
      26                 :            : #include "omassert.h"
      27                 :            : 
      28                 :            : #include <algorithm>
      29                 :            : 
      30                 :            : using namespace std;
      31                 :            : 
      32                 :            : collapse_result
      33                 :      22626 : CollapseData::check_item(const vector<Result>& results,
      34                 :            :                          const Result& result,
      35                 :            :                          Xapian::doccount collapse_max, MSetCmp mcmp,
      36                 :            :                          Xapian::doccount& old_item)
      37                 :            : {
      38         [ +  + ]:      22626 :     if (items.size() < collapse_max) {
      39                 :      16765 :         return ADD;
      40                 :            :     }
      41                 :            : 
      42                 :            :     // We already have collapse_max items better than result so we need to
      43                 :            :     // eliminate the lowest ranked.
      44 [ +  + ][ +  + ]:       5861 :     if (collapse_count == 0 && collapse_max != 1) {
      45                 :            :         // Be lazy about calling building the heap - if we see <= collapse_max
      46                 :            :         // items with a particular collapse key, we never need to use the heap.
      47                 :            :         Heap::make(items.begin(), items.end(),
      48                 :            :                    [&](pair<Xapian::doccount, Xapian::docid> a,
      49                 :       2339 :                        pair<Xapian::doccount, Xapian::docid> b) {
      50                 :       2339 :                        return mcmp(results[a.first], results[b.first]);
      51                 :       3725 :                    });
      52                 :            :     }
      53                 :       5861 :     ++collapse_count;
      54                 :            : 
      55                 :       5861 :     Xapian::doccount old_item_candidate = items.front().first;
      56                 :       5861 :     const Result& old_result = results[old_item_candidate];
      57         [ +  + ]:       5861 :     if (items.front().second != old_result.get_docid()) {
      58                 :            :         // The previous result with this collapse key we were going to replace
      59                 :            :         // has been pushed out of the protomset by higher ranking results.
      60                 :            :         //
      61                 :            :         // Awkwardly that means we don't know its exact weight, but we
      62                 :            :         // only need next_best_weight to know if we can zero collapse_count
      63                 :            :         // when there's a percentage cut-off, so setting it to an overestimate
      64                 :            :         // is OK, and it's not critically important as it makes no difference
      65                 :            :         // at all unless there's a percentage cut-off set.
      66                 :            :         //
      67                 :            :         // For now use the new result's weight.  FIXME: The lowest weight in
      68                 :            :         // the current MSet would be a tighter bound if we're sorting primarily
      69                 :            :         // by weight (if we're not, then is next_best_weight useful at all?)
      70                 :            :         // We could also check other entries in items to find an upper bound
      71                 :            :         // weight.
      72                 :         30 :         next_best_weight = result.get_weight();
      73                 :            : 
      74                 :         30 :         return REPLACE;
      75                 :            :     }
      76                 :            : 
      77         [ +  + ]:       5831 :     if (mcmp(old_result, result)) {
      78                 :            :         // If this is the "best runner-up", update next_best_weight.
      79         [ +  + ]:       3991 :         if (result.get_weight() > next_best_weight)
      80                 :       2018 :             next_best_weight = result.get_weight();
      81                 :       3991 :         return REJECT;
      82                 :            :     }
      83                 :            : 
      84                 :       1840 :     old_item = old_item_candidate;
      85                 :       1840 :     next_best_weight = old_result.get_weight();
      86                 :            : 
      87                 :       1840 :     items.front() = { old_item, result.get_docid() };
      88                 :            :     Heap::replace(items.begin(), items.end(),
      89                 :            :                   [&](pair<Xapian::doccount, Xapian::docid> a,
      90                 :        652 :                       pair<Xapian::doccount, Xapian::docid> b) {
      91                 :        652 :                       return mcmp(results[a.first], results[b.first]);
      92                 :       2492 :                   });
      93                 :            : 
      94                 :       1840 :     return REPLACE;
      95                 :            : }
      96                 :            : 
      97                 :            : void
      98                 :      37882 : CollapseData::set_item(Xapian::doccount item)
      99                 :            : {
     100                 :            :     AssertEq(items.size(), 1);
     101                 :            :     AssertEq(items[0].first, 0);
     102                 :      37882 :     items[0].first = item;
     103                 :      37882 : }
     104                 :            : 
     105                 :            : void
     106                 :      16765 : CollapseData::add_item(const vector<Result>& results,
     107                 :            :                        Xapian::doccount item,
     108                 :            :                        Xapian::doccount collapse_max,
     109                 :            :                        MSetCmp mcmp)
     110                 :            : {
     111                 :      16765 :     const Result& result = results[item];
     112         [ +  - ]:      16765 :     if (items.size() < collapse_max) {
     113         [ +  - ]:      16765 :         items.emplace_back(item, result.get_docid());
     114                 :      16765 :         return;
     115                 :            :     }
     116                 :            : 
     117                 :          0 :     items.front() = { item, result.get_docid() };
     118                 :            :     Heap::replace(items.begin(), items.end(),
     119                 :            :                   [&](pair<Xapian::doccount, Xapian::docid> a,
     120                 :          0 :                       pair<Xapian::doccount, Xapian::docid> b) {
     121                 :          0 :                       return mcmp(results[a.first], results[b.first]);
     122                 :          0 :                   });
     123                 :            : }
     124                 :            : 
     125                 :            : collapse_result
     126                 :      60692 : Collapser::check(Result& result,
     127                 :            :                  Xapian::Document::Internal& vsdoc)
     128                 :            : {
     129                 :      60692 :     ptr = NULL;
     130                 :      60692 :     ++docs_considered;
     131 [ +  - ][ +  - ]:      60692 :     result.set_collapse_key(vsdoc.get_value(slot));
     132                 :            : 
     133         [ +  + ]:      60692 :     if (result.get_collapse_key().empty()) {
     134                 :            :         // We don't collapse results with an empty collapse key.
     135                 :         96 :         ++no_collapse_key;
     136                 :         96 :         return EMPTY;
     137                 :            :     }
     138                 :            : 
     139                 :            :     // Use dummy value 0 for item - if process() is called, this will get
     140                 :            :     // updated to the appropriate value, and if it isn't then the docid won't
     141                 :            :     // match and we'll know the item isn't in the current proto-mset.
     142                 :      60596 :     auto r = table.emplace(result.get_collapse_key(),
     143   [ +  -  +  - ]:     121192 :                            CollapseData(0, result.get_docid()));
     144                 :      60596 :     ptr = &r.first->second;
     145         [ +  + ]:      60596 :     if (r.second) {
     146                 :            :         // We've not seen this collapse key before.
     147                 :      37970 :         ++entry_count;
     148                 :      37970 :         return NEW;
     149                 :            :     }
     150                 :            : 
     151                 :            :     collapse_result res;
     152                 :      22626 :     CollapseData& collapse_data = *ptr;
     153                 :            :     res = collapse_data.check_item(results, result, collapse_max, mcmp,
     154         [ +  - ]:      22626 :                                    old_item);
     155         [ +  + ]:      22626 :     if (res == ADD) {
     156                 :      16765 :         ++entry_count;
     157 [ +  + ][ +  - ]:       5861 :     } else if (res == REJECT || res == REPLACE) {
     158                 :       5861 :         ++dups_ignored;
     159                 :            :     }
     160                 :      60692 :     return res;
     161                 :            : }
     162                 :            : 
     163                 :            : void
     164                 :      54647 : Collapser::process(collapse_result action,
     165                 :            :                    Xapian::doccount item)
     166                 :            : {
     167      [ +  +  - ]:      54647 :     switch (action) {
     168                 :            :         case NEW:
     169                 :            :             // We've not seen this collapse key before.
     170                 :            :             Assert(ptr);
     171                 :      37882 :             ptr->set_item(item);
     172                 :      37882 :             return;
     173                 :            :         case ADD: {
     174                 :            :             Assert(ptr);
     175                 :      16765 :             ptr->add_item(results, item, collapse_max, mcmp);
     176                 :      16765 :             break;
     177                 :            :         }
     178                 :            :         default:
     179                 :            :             // Shouldn't be called for other actions.
     180                 :            :             Assert(false);
     181                 :            :     }
     182                 :            : }
     183                 :            : 
     184                 :            : Xapian::doccount
     185                 :      50402 : Collapser::get_collapse_count(const string & collapse_key,
     186                 :            :                               int percent_threshold,
     187                 :            :                               double min_weight) const
     188                 :            : {
     189         [ +  - ]:      50402 :     auto key = table.find(collapse_key);
     190                 :            :     // If a collapse key is present in the MSet, it must be in our table.
     191                 :            :     Assert(key != table.end());
     192                 :            : 
     193         [ +  + ]:      50402 :     if (!percent_threshold) {
     194                 :            :         // The recorded collapse_count is correct.
     195                 :       2998 :         return key->second.get_collapse_count();
     196                 :            :     }
     197                 :            : 
     198         [ +  + ]:      47404 :     if (key->second.get_next_best_weight() < min_weight) {
     199                 :            :         // We know for certain that all collapsed items would have failed the
     200                 :            :         // percentage cutoff, so collapse_count should be 0.
     201                 :      42868 :         return 0;
     202                 :            :     }
     203                 :            : 
     204                 :            :     // We know that the highest weighted collapsed item would have survived the
     205                 :            :     // percentage cutoff, but it's possible all other collapsed items would
     206                 :            :     // have failed it.  Since collapse_count is a lower bound, we must set it
     207                 :            :     // to 1.
     208                 :      50402 :     return 1;
     209                 :            : }
     210                 :            : 
     211                 :            : Xapian::doccount
     212                 :       1554 : Collapser::get_matches_lower_bound() const
     213                 :            : {
     214                 :            :     // We've seen this many matches, but all other documents matching the query
     215                 :            :     // could be collapsed onto values already seen.
     216                 :       1554 :     Xapian::doccount matches_lower_bound = no_collapse_key + entry_count;
     217                 :       1554 :     return matches_lower_bound;
     218                 :            :     // FIXME: *Unless* we haven't achieved collapse_max occurrences of *any*
     219                 :            :     // collapse key value, so we can increase matches_lower_bound like the
     220                 :            :     // code below, but it isn't quite that simple - there may not be that
     221                 :            :     // many documents.
     222                 :            : #if 0
     223                 :            :     Xapian::doccount max_kept = 0;
     224                 :            :     for (auto i = table.begin(); i != table.end(); ++i) {
     225                 :            :         if (i->second.get_collapse_count() > max_kept) {
     226                 :            :             max_kept = i->second.get_collapse_count();
     227                 :            :             if (max_kept == collapse_max) {
     228                 :            :                 return matches_lower_bound;
     229                 :            :             }
     230                 :            :         }
     231                 :            :     }
     232                 :            :     return matches_lower_bound + (collapse_max - max_kept);
     233                 :            : #endif
     234                 :            : }
     235                 :            : 
     236                 :            : void
     237                 :     149207 : Collapser::finalise(double min_weight, int percent_threshold)
     238                 :            : {
     239 [ +  + ][ +  + ]:     149207 :     if (table.empty() || results.empty())
                 [ +  + ]
     240                 :     149207 :         return;
     241                 :            : 
     242                 :            :     // We need to fill in collapse_count values in results using the
     243                 :            :     // information stored in table.
     244                 :      17746 :     Xapian::doccount todo = entry_count;
     245         [ +  + ]:      68148 :     for (Result& result : results) {
     246                 :      50402 :         const string& key = result.get_collapse_key();
     247         [ -  + ]:      50402 :         if (key.empty())
     248                 :          0 :             continue;
     249                 :            : 
     250                 :            :         // Fill in collapse_count.
     251                 :            :         result.set_collapse_count(get_collapse_count(key, percent_threshold,
     252         [ +  - ]:      50402 :                                                      min_weight));
     253         [ +  + ]:      50402 :         if (--todo == 0) {
     254                 :            :             // Terminate early if we've handled all non-empty entries.
     255                 :      14114 :             break;
     256                 :            :         }
     257                 :            :     }
     258                 :            : }

Generated by: LCOV version 1.11