LCOV - code coverage report
Current view: top level - include/xapian - termgenerator.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 4 7 57.1 %
Date: 2019-05-23 11:15:29 Functions: 5 6 83.3 %
Branches: 2 4 50.0 %

           Branch data     Line data    Source code
       1                 :            : /** @file termgenerator.h
       2                 :            :  * @brief parse free text and generate terms
       3                 :            :  */
       4                 :            : /* Copyright (C) 2007,2009,2011,2012,2013,2014,2018 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #ifndef XAPIAN_INCLUDED_TERMGENERATOR_H
      22                 :            : #define XAPIAN_INCLUDED_TERMGENERATOR_H
      23                 :            : 
      24                 :            : #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
      25                 :            : # error "Never use <xapian/termgenerator.h> directly; include <xapian.h> instead."
      26                 :            : #endif
      27                 :            : 
      28                 :            : #include <xapian/intrusive_ptr.h>
      29                 :            : #include <xapian/types.h>
      30                 :            : #include <xapian/unicode.h>
      31                 :            : #include <xapian/visibility.h>
      32                 :            : 
      33                 :            : #include <string>
      34                 :            : 
      35                 :            : namespace Xapian {
      36                 :            : 
      37                 :            : class Document;
      38                 :            : class Stem;
      39                 :            : class Stopper;
      40                 :            : class WritableDatabase;
      41                 :            : 
      42                 :            : /** Parses a piece of text and generates terms.
      43                 :            :  *
      44                 :            :  * This module takes a piece of text and parses it to produce words which are
      45                 :            :  * then used to generate suitable terms for indexing.  The terms generated are
      46                 :            :  * suitable for use with Query objects produced by the QueryParser class.
      47                 :            :  */
      48                 :          8 : class XAPIAN_VISIBILITY_DEFAULT TermGenerator {
      49                 :            :   public:
      50                 :            :     /// @private @internal Class representing the TermGenerator internals.
      51                 :            :     class Internal;
      52                 :            :     /// @private @internal Reference counted internals.
      53                 :            :     Xapian::Internal::intrusive_ptr_nonnull<Internal> internal;
      54                 :            : 
      55                 :            :     /// Copy constructor.
      56                 :            :     TermGenerator(const TermGenerator & o);
      57                 :            : 
      58                 :            :     /// Assignment.
      59                 :            :     TermGenerator & operator=(const TermGenerator & o);
      60                 :            : 
      61                 :            :     /// Move constructor.
      62                 :            :     TermGenerator(TermGenerator && o);
      63                 :            : 
      64                 :            :     /// Move assignment operator.
      65                 :            :     TermGenerator & operator=(TermGenerator && o);
      66                 :            : 
      67                 :            :     /// Default constructor.
      68                 :            :     TermGenerator();
      69                 :            : 
      70                 :            :     /// Destructor.
      71                 :            :     ~TermGenerator();
      72                 :            : 
      73                 :            :     /// Set the Xapian::Stem object to be used for generating stemmed terms.
      74                 :            :     void set_stemmer(const Xapian::Stem & stemmer);
      75                 :            : 
      76                 :            :     /** Set the Xapian::Stopper object to be used for identifying stopwords.
      77                 :            :      *
      78                 :            :      *  Stemmed forms of stopwords aren't indexed, but unstemmed forms still
      79                 :            :      *  are so that searches for phrases including stop words still work.
      80                 :            :      *
      81                 :            :      *  @param stop     The Stopper object to set (default NULL, which means no
      82                 :            :      *                  stopwords).
      83                 :            :      */
      84                 :            :     void set_stopper(const Xapian::Stopper *stop = NULL);
      85                 :            : 
      86                 :            :     /// Set the current document.
      87                 :            :     void set_document(const Xapian::Document & doc);
      88                 :            : 
      89                 :            :     /// Get the current document.
      90                 :            :     const Xapian::Document & get_document() const;
      91                 :            : 
      92                 :            :     /// Set the database to index spelling data to.
      93                 :            :     void set_database(const Xapian::WritableDatabase &db);
      94                 :            : 
      95                 :            :     /// For backward compatibility with Xapian 1.2
      96                 :            :     typedef int flags;
      97                 :            : 
      98                 :            :     /// Flags to OR together and pass to TermGenerator::set_flags().
      99                 :            :     enum {
     100                 :            :         /// Index data required for spelling correction.
     101                 :            :         FLAG_SPELLING = 128, // Value matches QueryParser flag.
     102                 :            : 
     103                 :            :         /** Enable generation of n-grams from CJK text.
     104                 :            :          *
     105                 :            :          *  With this enabled, spans of CJK characters are split into unigrams
     106                 :            :          *  and bigrams, with the unigrams carrying positional information.
     107                 :            :          *  Non-CJK characters are split into words as normal.
     108                 :            :          *
     109                 :            :          *  The corresponding option needs to be passed to QueryParser.
     110                 :            :          *
     111                 :            :          *  Flag added in Xapian 1.3.4 and 1.2.22.  This mode can be
     112                 :            :          *  enabled in 1.2.8 and later by setting environment variable
     113                 :            :          *  XAPIAN_CJK_NGRAM to a non-empty value (but doing so was deprecated
     114                 :            :          *  in 1.4.11).
     115                 :            :          */
     116                 :            :         FLAG_CJK_NGRAM = 2048, // Value matches QueryParser flag.
     117                 :            : 
     118                 :            :         /** Enable generation of words from CJK text.
     119                 :            :          *
     120                 :            :          *  With this enabled, spans of CJK characters are split into CJK
     121                 :            :          *  words using text boundary heuristics. Non-CJK characters are
     122                 :            :          *  split into words as normal.
     123                 :            :          *
     124                 :            :          *  The corresponding option needs to be passed to QueryParser.
     125                 :            :          */
     126                 :            :         FLAG_CJK_WORDS = 4096 // Value matches QueryParser flag
     127                 :            :     };
     128                 :            : 
     129                 :            :     /// Stemming strategies, for use with set_stemming_strategy().
     130                 :            :     typedef enum {
     131                 :            :         STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z, STEM_SOME_FULL_POS
     132                 :            :     } stem_strategy;
     133                 :            : 
     134                 :            :     /// Stopper strategies, for use with set_stopper_strategy().
     135                 :            :     typedef enum { STOP_NONE, STOP_ALL, STOP_STEMMED } stop_strategy;
     136                 :            : 
     137                 :            :     /** Set flags.
     138                 :            :      *
     139                 :            :      *  The new value of flags is: (flags & mask) ^ toggle
     140                 :            :      *
     141                 :            :      *  To just set the flags, pass the new flags in toggle and the
     142                 :            :      *  default value for mask.
     143                 :            :      *
     144                 :            :      *  @param toggle   Flags to XOR.
     145                 :            :      *  @param mask     Flags to AND with first.
     146                 :            :      *
     147                 :            :      *  @return         The old flags setting.
     148                 :            :      */
     149                 :            :     flags set_flags(flags toggle, flags mask = flags(0));
     150                 :            : 
     151                 :            :     /** Set the stemming strategy.
     152                 :            :      *
     153                 :            :      *  This method controls how the stemming algorithm is applied.  It was
     154                 :            :      *  new in Xapian 1.3.1.
     155                 :            :      *
     156                 :            :      *  @param strategy The strategy to use - possible values are:
     157                 :            :      *   - STEM_NONE:   Don't perform any stemming - only unstemmed terms
     158                 :            :      *                  are generated.
     159                 :            :      *   - STEM_SOME:   Generate both stemmed (with a "Z" prefix) and unstemmed
     160                 :            :      *                  terms.  No positional information is stored for
     161                 :            :      *                  unstemmed terms.  This is the default strategy.
     162                 :            :      *   - STEM_SOME_FULL_POS:
     163                 :            :      *                  Like STEM_SOME but positional information is stored
     164                 :            :      *                  for both stemmed and unstemmed terms.  Added in Xapian
     165                 :            :      *                  1.4.8.
     166                 :            :      *   - STEM_ALL:    Generate only stemmed terms (but without a "Z" prefix).
     167                 :            :      *   - STEM_ALL_Z:  Generate only stemmed terms (with a "Z" prefix).
     168                 :            :      */
     169                 :            :     void set_stemming_strategy(stem_strategy strategy);
     170                 :            : 
     171                 :            :     /** Set the stopper strategy.
     172                 :            :      *
     173                 :            :      *  This method controls how the stopper is used.  It was added in Xapian
     174                 :            :      *  1.4.1.
     175                 :            :      *
     176                 :            :      *  You need to also call @a set_stopper() for this to have any effect.
     177                 :            :      *
     178                 :            :      *  @param strategy The strategy to use - possible values are:
     179                 :            :      *   - STOP_NONE:     Don't use the stopper.
     180                 :            :      *   - STOP_ALL:      If a word is identified as a stop word, skip it
     181                 :            :      *                    completely.
     182                 :            :      *   - STOP_STEMMED:  If a word is identified as a stop word, index its
     183                 :            :      *                    unstemmed form but skip the stem.  Unstemmed forms
     184                 :            :      *                    are indexed with positional information by default,
     185                 :            :      *                    so this allows searches for phrases containing
     186                 :            :      *                    stopwords to be supported.  (This is the default
     187                 :            :      *                    mode).
     188                 :            :      */
     189                 :            :     void set_stopper_strategy(stop_strategy strategy);
     190                 :            : 
     191                 :            :     /** Set the maximum length word to index.
     192                 :            :      *
     193                 :            :      *  The limit is on the length of a word prior to stemming and prior to
     194                 :            :      *  adding any term prefix.
     195                 :            :      *
     196                 :            :      *  The backends mostly impose a limit on the length of terms (often of
     197                 :            :      *  about 240 bytes), but it's generally useful to have a lower limit to
     198                 :            :      *  help prevent the index being bloated by useless junk terms from trying
     199                 :            :      *  to index things like binary data, uuencoded data, ASCII art, etc.
     200                 :            :      *
     201                 :            :      *  This method was new in Xapian 1.3.1.
     202                 :            :      *
     203                 :            :      *  @param max_word_length  The maximum length word to index, in bytes in
     204                 :            :      *                          UTF-8 representation.  Default is 64.
     205                 :            :      */
     206                 :            :     void set_max_word_length(unsigned max_word_length);
     207                 :            : 
     208                 :            :     /** Index some text.
     209                 :            :      *
     210                 :            :      * @param itor      Utf8Iterator pointing to the text to index.
     211                 :            :      * @param wdf_inc   The wdf increment (default 1).
     212                 :            :      * @param prefix    The term prefix to use (default is no prefix).
     213                 :            :      */
     214                 :            :     void index_text(const Xapian::Utf8Iterator & itor,
     215                 :            :                     Xapian::termcount wdf_inc = 1,
     216                 :            :                     const std::string & prefix = std::string());
     217                 :            : 
     218                 :            :     /** Index some text in a std::string.
     219                 :            :      *
     220                 :            :      * @param text      The text to index.
     221                 :            :      * @param wdf_inc   The wdf increment (default 1).
     222                 :            :      * @param prefix    The term prefix to use (default is no prefix).
     223                 :            :      */
     224                 :        117 :     void index_text(const std::string & text,
     225                 :            :                     Xapian::termcount wdf_inc = 1,
     226                 :            :                     const std::string & prefix = std::string()) {
     227         [ +  + ]:        117 :         index_text(Utf8Iterator(text), wdf_inc, prefix);
     228                 :        107 :     }
     229                 :            : 
     230                 :            :     /** Index some text without positional information.
     231                 :            :      *
     232                 :            :      * Just like index_text, but no positional information is generated.  This
     233                 :            :      * means that the database will be significantly smaller, but that phrase
     234                 :            :      * searching and NEAR won't be supported.
     235                 :            :      *
     236                 :            :      * @param itor      Utf8Iterator pointing to the text to index.
     237                 :            :      * @param wdf_inc   The wdf increment (default 1).
     238                 :            :      * @param prefix    The term prefix to use (default is no prefix).
     239                 :            :      */
     240                 :            :     void index_text_without_positions(const Xapian::Utf8Iterator & itor,
     241                 :            :                                       Xapian::termcount wdf_inc = 1,
     242                 :            :                                       const std::string & prefix = std::string());
     243                 :            : 
     244                 :            :     /** Index some text in a std::string without positional information.
     245                 :            :      *
     246                 :            :      * Just like index_text, but no positional information is generated.  This
     247                 :            :      * means that the database will be significantly smaller, but that phrase
     248                 :            :      * searching and NEAR won't be supported.
     249                 :            :      *
     250                 :            :      * @param text      The text to index.
     251                 :            :      * @param wdf_inc   The wdf increment (default 1).
     252                 :            :      * @param prefix    The term prefix to use (default is no prefix).
     253                 :            :      */
     254                 :          0 :     void index_text_without_positions(const std::string & text,
     255                 :            :                                       Xapian::termcount wdf_inc = 1,
     256                 :            :                                       const std::string & prefix = std::string()) {
     257         [ #  # ]:          0 :         index_text_without_positions(Utf8Iterator(text), wdf_inc, prefix);
     258                 :          0 :     }
     259                 :            : 
     260                 :            :     /** Increase the term position used by index_text.
     261                 :            :      *
     262                 :            :      *  This can be used between indexing text from different fields or other
     263                 :            :      *  places to prevent phrase searches from spanning between them (e.g.
     264                 :            :      *  between the title and body text, or between two chapters in a book).
     265                 :            :      *
     266                 :            :      *  @param delta    Amount to increase the term position by (default: 100).
     267                 :            :      */
     268                 :            :     void increase_termpos(Xapian::termpos delta = 100);
     269                 :            : 
     270                 :            :     /// Get the current term position.
     271                 :            :     Xapian::termpos get_termpos() const;
     272                 :            : 
     273                 :            :     /** Set the current term position.
     274                 :            :      *
     275                 :            :      *  @param termpos  The new term position to set.
     276                 :            :      */
     277                 :            :     void set_termpos(Xapian::termpos termpos);
     278                 :            : 
     279                 :            :     /// Return a string describing this object.
     280                 :            :     std::string get_description() const;
     281                 :            : };
     282                 :            : 
     283                 :            : }
     284                 :            : 
     285                 :            : #endif // XAPIAN_INCLUDED_TERMGENERATOR_H

Generated by: LCOV version 1.11