LCOV - code coverage report
Current view: top level - api - editdistance.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 19 19 100.0 %
Date: 2019-05-23 11:15:29 Functions: 3 3 100.0 %
Branches: 12 14 85.7 %

           Branch data     Line data    Source code
       1                 :            : /** @file editdistance.h
       2                 :            :  * @brief Edit distance calculation algorithm.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2003 Richard Boulton
       5                 :            :  * Copyright (C) 2007,2008,2017,2019 Olly Betts
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or modify
       8                 :            :  * it under the terms of the GNU General Public License as published by
       9                 :            :  * the Free Software Foundation; either version 2 of the License, or
      10                 :            :  * (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful,
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #ifndef XAPIAN_INCLUDED_EDITDISTANCE_H
      23                 :            : #define XAPIAN_INCLUDED_EDITDISTANCE_H
      24                 :            : 
      25                 :            : #include <cstdlib>
      26                 :            : #include <climits>
      27                 :            : #include <vector>
      28                 :            : 
      29                 :            : #include "omassert.h"
      30                 :            : #include "xapian/unicode.h"
      31                 :            : 
      32                 :            : /** Calculate edit distances to a target string.
      33                 :            :  *
      34                 :            :  *  Edit distance is defined as the minimum number of edit operations
      35                 :            :  *  required to move from one sequence to another.  The edit operations
      36                 :            :  *  considered are:
      37                 :            :  *   - Insertion of a character at an arbitrary position.
      38                 :            :  *   - Deletion of a character at an arbitrary position.
      39                 :            :  *   - Substitution of a character at an arbitrary position.
      40                 :            :  *   - Transposition of two neighbouring characters at an arbitrary position
      41                 :            :  *     in the string.
      42                 :            :  */
      43                 :            : class EditDistanceCalculator {
      44                 :            :     /// Don't allow assignment.
      45                 :            :     EditDistanceCalculator& operator=(const EditDistanceCalculator&) = delete;
      46                 :            : 
      47                 :            :     /// Don't allow copying.
      48                 :            :     EditDistanceCalculator(const EditDistanceCalculator&) = delete;
      49                 :            : 
      50                 :            :     /// Target in UTF-32.
      51                 :            :     std::vector<unsigned> target;
      52                 :            : 
      53                 :            :     /// Current candidate in UTF-32.
      54                 :            :     mutable std::vector<unsigned> utf32;
      55                 :            : 
      56                 :            :     mutable int* array = nullptr;
      57                 :            : 
      58                 :            :     // We sum the character frequency histogram absolute differences to compute
      59                 :            :     // a lower bound on the edit distance.  Rather than counting each Unicode
      60                 :            :     // code point uniquely, we use an array with VEC_SIZE elements and tally
      61                 :            :     // code points modulo VEC_SIZE which can only reduce the bound we
      62                 :            :     // calculate.
      63                 :            :     //
      64                 :            :     // There will be a trade-off between how good the bound is and how large
      65                 :            :     // an array is used (a larger array takes more time to clear and sum
      66                 :            :     // over).  The value 64 is somewhat arbitrary - it works as well as 128 for
      67                 :            :     // the testsuite but that may not reflect real world performance.
      68                 :            :     // FIXME: profile and tune.
      69                 :            :     static constexpr int VEC_SIZE = 64;
      70                 :            : 
      71                 :            :     /** Frequency histogram for target sequence.
      72                 :            :      *
      73                 :            :      *  Note: C++ will default initialise all remaining elements.
      74                 :            :      */
      75                 :            :     int target_freqs[VEC_SIZE] = { 0 };
      76                 :            : 
      77                 :            :     /** Calculate edit distance.
      78                 :            :      *
      79                 :            :      *  Internal helper - the cheap case is inlined from the header.
      80                 :            :      */
      81                 :            :     int calc(const unsigned* ptr, int len, int max_distance) const;
      82                 :            : 
      83                 :            :   public:
      84                 :            :     /** Constructor.
      85                 :            :      *
      86                 :            :      *  @param target_  Target string to calculate edit distances to.
      87                 :            :      */
      88                 :            :     explicit
      89                 :        830 :     EditDistanceCalculator(const std::string& target_) {
      90                 :            :         using Xapian::Utf8Iterator;
      91         [ +  + ]:       2466 :         for (Utf8Iterator it(target_); it != Utf8Iterator(); ++it) {
      92                 :       2051 :             unsigned ch = *it;
      93         [ +  - ]:       2051 :             target.push_back(ch);
      94                 :       2051 :             ++target_freqs[ch % VEC_SIZE];
      95                 :            :         }
      96                 :        415 :     }
      97                 :            : 
      98                 :        830 :     ~EditDistanceCalculator() {
      99         [ +  + ]:        415 :         delete [] array;
     100                 :        415 :     }
     101                 :            : 
     102                 :            :     /** Calculate edit distance for a sequence.
     103                 :            :      *
     104                 :            :      *  @param candidate        String to calculate edit distance for.
     105                 :            :      *  @param max_distance     The greatest edit distance that's interesting
     106                 :            :      *                          to us.  If the true edit distance is >
     107                 :            :      *                          max_distance, any value > max_distance may be
     108                 :            :      *                          returned instead (which allows the edit
     109                 :            :      *                          distance algorithm to avoid work for poor
     110                 :            :      *                          matches).  The value passed for subsequent
     111                 :            :      *                          calls to this method on the same object must be
     112                 :            :      *                          the same or less.
     113                 :            :      *
     114                 :            :      *  @return The edit distance between candidate and the target.
     115                 :            :      */
     116                 :       4321 :     int operator()(const std::string& candidate, int max_distance) const {
     117                 :            :         // There's no point considering a word where the difference in length
     118                 :            :         // is greater than the smallest number of edits we've found so far.
     119                 :            :         //
     120                 :            :         // First check based on the encoded UTF-8 length of the candidate.
     121                 :            :         // Each Unicode codepoint is 1-4 bytes in UTF-8 and one word in UTF-32,
     122                 :            :         // so the number of UTF-32 characters in candidate must be >=
     123                 :            :         // int((bytes + 3) / 4) and <= bytes.
     124         [ +  + ]:       4321 :         if (target.size() > candidate.size() + max_distance) {
     125                 :            :             // Candidate too short.
     126                 :        961 :             return INT_MAX;
     127                 :            :         }
     128         [ +  + ]:       3360 :         if (target.size() + max_distance < candidate.size() * 3 / 4) {
     129                 :            :             // Candidate too long.
     130                 :        124 :             return INT_MAX;
     131                 :            :         }
     132                 :            : 
     133                 :            :         // Now convert to UTF-32.
     134         [ +  - ]:       3236 :         utf32.assign(Xapian::Utf8Iterator(candidate), Xapian::Utf8Iterator());
     135                 :            : 
     136                 :            :         // Check a cheap length-based lower bound based on UTF-32 lengths.
     137                 :       3236 :         int lb = std::abs(int(utf32.size()) - int(target.size()));
     138         [ +  + ]:       3236 :         if (lb > max_distance) {
     139                 :        484 :             return lb;
     140                 :            :         }
     141                 :            : 
     142                 :            :         // Actually calculate the edit distance.
     143                 :       4321 :         return calc(&utf32[0], utf32.size(), max_distance);
     144                 :            :     }
     145                 :            : };
     146                 :            : 
     147                 :            : #endif // XAPIAN_INCLUDED_EDITDISTANCE_H

Generated by: LCOV version 1.11