LCOV - code coverage report
Current view: top level - include/xapian - unicode.h (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7822d31adece Lines: 70 71 98.6 %
Date: 2019-05-23 11:15:29 Functions: 22 22 100.0 %
Branches: 25 30 83.3 %

           Branch data     Line data    Source code
       1                 :            : /** @file unicode.h
       2                 :            :  * @brief Unicode and UTF-8 related classes and functions.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2018 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #ifndef XAPIAN_INCLUDED_UNICODE_H
      22                 :            : #define XAPIAN_INCLUDED_UNICODE_H
      23                 :            : 
      24                 :            : #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
      25                 :            : # error "Never use <xapian/unicode.h> directly; include <xapian.h> instead."
      26                 :            : #endif
      27                 :            : 
      28                 :            : #include <xapian/attributes.h>
      29                 :            : #include <xapian/visibility.h>
      30                 :            : 
      31                 :            : #include <string>
      32                 :            : 
      33                 :            : namespace Xapian {
      34                 :            : 
      35                 :            : /** An iterator which returns Unicode character values from a UTF-8 encoded
      36                 :            :  *  string.
      37                 :            :  */
      38                 :            : class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
      39                 :            :     const unsigned char *p; // Current position; NULL once the end is reached.
      40                 :            :     const unsigned char *end; // One past the last byte of the string.
      41                 :            :     mutable unsigned seqlen; // Length in bytes of the current sequence; 0 = not yet calculated.
      42                 :            : 
      43                 :            :     bool XAPIAN_NOTHROW(calculate_sequence_length() const); // Works out seqlen (see operator++).
      44                 :            : 
      45                 :            :     unsigned get_char() const;
      46                 :            : 
      47                 :     125353 :     Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
      48                 :     125353 :         : p(p_), end(end_), seqlen(seqlen_) { } // Used by operator++(int) to build its result.
      49                 :            : 
      50                 :            :   public:
      51                 :            :     /** Return the raw const char * pointer for the current position. */
      52                 :      91337 :     const char * raw() const {
      53         [ +  + ]:      91337 :         return reinterpret_cast<const char *>(p ? p : end); // p is NULL at the end, so hand back end instead.
      54                 :            :     }
      55                 :            : 
      56                 :            :     /** Return the number of bytes left in the iterator's buffer. */
      57         [ +  + ]:       6580 :     size_t left() const { return p ? end - p : 0; }
      58                 :            : 
      59                 :            :     /** Assign a new string to the iterator.
      60                 :            :      *
      61                 :            :      *  The iterator will forget the string it was iterating through, and
      62                 :            :      *  return characters from the start of the new string when next called.
      63                 :            :      *  The string is not copied into the iterator, so it must remain valid
      64                 :            :      *  while the iteration is in progress.
      65                 :            :      *
      66                 :            :      *  @param p_ A pointer to the start of the string to read.
      67                 :            :      *
      68                 :            :      *  @param len The length of the string to read, in bytes.  If zero, the iterator is put in the at-end state.
      69                 :            :      */
      70                 :     170947 :     void assign(const char *p_, size_t len) {
      71         [ +  - ]:     170947 :         if (len) {
      72                 :     170947 :             p = reinterpret_cast<const unsigned char*>(p_);
      73                 :     170947 :             end = p + len;
      74                 :     170947 :             seqlen = 0;
      75                 :            :         } else {
      76                 :          0 :             p = NULL; // NOTE(review): end and seqlen are not set on this path; after constructing from an empty string raw() would read an uninitialised end - confirm.
      77                 :            :         }
      78                 :     170947 :     }
      79                 :            : 
      80                 :            :     /** Assign a new string to the iterator.
      81                 :            :      *
      82                 :            :      *  The iterator will forget the string it was iterating through, and
      83                 :            :      *  return characters from the start of the new string when next called.
      84                 :            :      *  The string is not copied into the iterator, so it must remain valid
      85                 :            :      *  while the iteration is in progress.
      86                 :            :      *
      87                 :            :      *  @param s The string to read.  Must not be modified while the iteration
      88                 :            :      *           is in progress.
      89                 :            :      */
      90                 :            :     void assign(const std::string &s) { assign(s.data(), s.size()); }
      91                 :            : 
      92                 :            :     /** Create an iterator given a pointer to a null terminated string.
      93                 :            :      *
      94                 :            :      *  The iterator will return characters from the start of the string when
      95                 :            :      *  next called.  The string is not copied into the iterator, so it must
      96                 :            :      *  remain valid while the iteration is in progress.
      97                 :            :      *
      98                 :            :      *  @param p_ A pointer to the start of the null terminated string to read.
      99                 :            :      */
     100                 :            :     explicit Utf8Iterator(const char *p_);
     101                 :            : 
     102                 :            :     /** Create an iterator given a pointer and a length.
     103                 :            :      *
     104                 :            :      *  The iterator will return characters from the start of the string when
     105                 :            :      *  next called.  The string is not copied into the iterator, so it must
     106                 :            :      *  remain valid while the iteration is in progress.
     107                 :            :      *
     108                 :            :      *  @param p_ A pointer to the start of the string to read.
     109                 :            :      *
     110                 :            :      *  @param len The length of the string to read, in bytes.
     111                 :            :      */
     112                 :       2462 :     Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }
     113                 :            : 
     114                 :            :     /** Create an iterator given a string.
     115                 :            :      *
     116                 :            :      *  The iterator will return characters from the start of the string when
     117                 :            :      *  next called.  The string is not copied into the iterator, so it must
     118                 :            :      *  remain valid while the iteration is in progress.
     119                 :            :      *
     120                 :            :      *  @param s The string to read.  Must not be modified while the iteration
     121                 :            :      *           is in progress.
     122                 :            :      */
     123                 :     339412 :     Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }
     124                 :            : 
     125                 :            :     /** Create an iterator which is at the end of its iteration.
     126                 :            :      *
     127                 :            :      *  This can be compared to another iterator to check if the other iterator
     128                 :            :      *  has reached its end.
     129                 :            :      */
     130                 :     497204 :     XAPIAN_NOTHROW(Utf8Iterator())
     131                 :     497204 :         : p(NULL), end(0), seqlen(0) { }
     132                 :            : 
     133                 :            :     /** Get the current Unicode character value pointed to by the iterator.
     134                 :            :      *
     135                 :            :      *  If an invalid UTF-8 sequence is encountered, then the byte values
     136                 :            :      *  comprising it are returned until valid UTF-8 or the end of the input is
     137                 :            :      *  reached.
     138                 :            :      *
     139                 :            :      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
     140                 :            :      */
     141                 :            :     unsigned XAPIAN_NOTHROW(operator*() const) XAPIAN_PURE_FUNCTION;
     142                 :            : 
     143                 :            :     /** @private @internal Get the current Unicode character
     144                 :            :      *  value pointed to by the iterator.
     145                 :            :      *
     146                 :            :      *  If an invalid UTF-8 sequence is encountered, then the byte values
     147                 :            :      *  comprising it are returned with the top bit set (so the caller can
     148                 :            :      *  differentiate these from the same values arising from valid UTF-8)
     149                 :            :      *  until valid UTF-8 or the end of the input is reached.
     150                 :            :      *
     151                 :            :      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
     152                 :            :      */
     153                 :            :     unsigned XAPIAN_NOTHROW(strict_deref() const) XAPIAN_PURE_FUNCTION;
     154                 :            : 
     155                 :            :     /** Move forward to the next Unicode character.
     156                 :            :      *
     157                 :            :      *  @return An iterator pointing to the position before the move.
     158                 :            :      */
     159                 :     125353 :     Utf8Iterator operator++(int) {
     160                 :            :         // If we've not calculated seqlen yet, do so.
     161         [ +  + ]:     125353 :         if (seqlen == 0) calculate_sequence_length();
     162                 :     125353 :         const unsigned char *old_p = p;
     163                 :     125353 :         unsigned old_seqlen = seqlen;
     164                 :     125353 :         p += seqlen;
     165         [ +  + ]:     125353 :         if (p == end) p = NULL; // The at-end state is represented by p == NULL.
     166                 :     125353 :         seqlen = 0; // Not yet calculated for the new position.
     167                 :     125353 :         return Utf8Iterator(old_p, end, old_seqlen);
     168                 :            :     }
     169                 :            : 
     170                 :            :     /** Move forward to the next Unicode character.
     171                 :            :      *
     172                 :            :      *  @return A reference to this object.
     173                 :            :      */
     174                 :     884691 :     Utf8Iterator & operator++() {
     175         [ +  + ]:     884691 :         if (seqlen == 0) calculate_sequence_length(); // Make sure seqlen is known before stepping.
     176                 :     884691 :         p += seqlen;
     177         [ +  + ]:     884691 :         if (p == end) p = NULL; // The at-end state is represented by p == NULL.
     178                 :     884691 :         seqlen = 0; // Not yet calculated for the new position.
     179                 :     884691 :         return *this;
     180                 :            :     }
     181                 :            : 
     182                 :            :     /** Test two Utf8Iterators for equality.
     183                 :            :      *
     184                 :            :      *  @param other    The Utf8Iterator to compare this one with.
     185                 :            :      *  @return true iff the iterators point to the same position.
     186                 :            :      */
     187                 :     288748 :     bool XAPIAN_NOTHROW(operator==(const Utf8Iterator &other) const) {
     188                 :     288748 :         return p == other.p; // Only p is compared, so all at-end iterators (p == NULL) are equal.
     189                 :            :     }
     190                 :            : 
     191                 :            :     /** Test two Utf8Iterators for inequality.
     192                 :            :      *
     193                 :            :      *  @param other    The Utf8Iterator to compare this one with.
     194                 :            :      *  @return true iff the iterators do not point to the same position.
     195                 :            :      */
     196                 :    1483968 :     bool XAPIAN_NOTHROW(operator!=(const Utf8Iterator &other) const) {
     197                 :    1483968 :         return p != other.p; // Only p is compared, mirroring operator==.
     198                 :            :     }
     199                 :            : 
     200                 :            :     /// We implement the semantics of an STL input_iterator.
     201                 :            :     //@{
     202                 :            :     typedef std::input_iterator_tag iterator_category;
     203                 :            :     typedef unsigned value_type;
     204                 :            :     typedef size_t difference_type; // NOTE(review): unsigned, whereas standard iterators use a signed difference_type - confirm intent.
     205                 :            :     typedef const unsigned * pointer;
     206                 :            :     typedef const unsigned & reference;
     207                 :            :     //@}
     208                 :            : };
     209                 :            : 
     210                 :            : /// Functions associated with handling Unicode characters.
     211                 :            : namespace Unicode {
     212                 :            : 
     213                 :            : /** Each Unicode character is in exactly one of these categories.
     214                 :            :  *
     215                 :            :  * The Unicode standard calls this the "General Category", and uses a
     216                 :            :  * "Major, minor" convention to derive a two letter code.
     217                 :            :  */
     218                 :            : typedef enum {
     219                 :            :     UNASSIGNED,                         /**< Other, not assigned (Cn) */
     220                 :            :     UPPERCASE_LETTER,                   /**< Letter, uppercase (Lu) */
     221                 :            :     LOWERCASE_LETTER,                   /**< Letter, lowercase (Ll) */
     222                 :            :     TITLECASE_LETTER,                   /**< Letter, titlecase (Lt) */
     223                 :            :     MODIFIER_LETTER,                    /**< Letter, modifier (Lm) */
     224                 :            :     OTHER_LETTER,                       /**< Letter, other (Lo) */
     225                 :            :     NON_SPACING_MARK,                   /**< Mark, nonspacing (Mn) */
     226                 :            :     ENCLOSING_MARK,                     /**< Mark, enclosing (Me) */
     227                 :            :     COMBINING_SPACING_MARK,             /**< Mark, spacing combining (Mc) */
     228                 :            :     DECIMAL_DIGIT_NUMBER,               /**< Number, decimal digit (Nd) */
     229                 :            :     LETTER_NUMBER,                      /**< Number, letter (Nl) */
     230                 :            :     OTHER_NUMBER,                       /**< Number, other (No) */
     231                 :            :     SPACE_SEPARATOR,                    /**< Separator, space (Zs) */
     232                 :            :     LINE_SEPARATOR,                     /**< Separator, line (Zl) */
     233                 :            :     PARAGRAPH_SEPARATOR,                /**< Separator, paragraph (Zp) */
     234                 :            :     CONTROL,                            /**< Other, control (Cc) */
     235                 :            :     FORMAT,                             /**< Other, format (Cf) */
     236                 :            :     PRIVATE_USE,                        /**< Other, private use (Co) */
     237                 :            :     SURROGATE,                          /**< Other, surrogate (Cs) */
     238                 :            :     CONNECTOR_PUNCTUATION,              /**< Punctuation, connector (Pc) */
     239                 :            :     DASH_PUNCTUATION,                   /**< Punctuation, dash (Pd) */
     240                 :            :     OPEN_PUNCTUATION,                   /**< Punctuation, open (Ps) */
     241                 :            :     CLOSE_PUNCTUATION,                  /**< Punctuation, close (Pe) */
     242                 :            :     INITIAL_QUOTE_PUNCTUATION,          /**< Punctuation, initial quote (Pi) */
     243                 :            :     FINAL_QUOTE_PUNCTUATION,            /**< Punctuation, final quote (Pf) */
     244                 :            :     OTHER_PUNCTUATION,                  /**< Punctuation, other (Po) */
     245                 :            :     MATH_SYMBOL,                        /**< Symbol, math (Sm) */
     246                 :            :     CURRENCY_SYMBOL,                    /**< Symbol, currency (Sc) */
     247                 :            :     MODIFIER_SYMBOL,                    /**< Symbol, modifier (Sk) */
     248                 :            :     OTHER_SYMBOL                        /**< Symbol, other (So) */
     249                 :            : } category;
     250                 :            : 
     251                 :            : namespace Internal {
     252                 :            :     /** @private @internal Extract the information about a character from the
     253                 :            :      *  Unicode character tables (decode it with get_category()/get_delta()).
     254                 :            :      *
     255                 :            :      *  Characters outside of the Unicode range (i.e. ch >= 0x110000) are
     256                 :            :      *  treated as UNASSIGNED with no case variants.
     257                 :            :      */
     258                 :            :     XAPIAN_VISIBILITY_DEFAULT
     259                 :            :     int XAPIAN_NOTHROW(get_character_info(unsigned ch)) XAPIAN_CONST_FUNCTION;
     260                 :            : 
     261                 :            :     /** @private @internal Bit-masks for case conversion.
     262                 :            :      *
     263                 :            :      *  If the respective bit is set in the return value of
     264                 :            :      *  get_character_info() then the delta value also contained in that
     265                 :            :      *  return value needs adding/subtracting to convert to lower/upper
     266                 :            :      *  case.
     267                 :            :      */
     268                 :            :     enum { INFO_TOLOWER_MASK = 0x40, INFO_TOUPPER_MASK = 0x80 };
     269                 :            : 
     270                 :            :     /** @private @internal Extract the category of a Unicode character from its
     271                 :            :      *  info (the category is held in the low 5 bits).
     272                 :            :      */
     273                 :    1550192 :     inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
     274                 :            : 
     275                 :            :     /** @private @internal Extract the delta to use for case conversion of a
     276                 :            :      *  character from its info (the signed value held above the low 8 bits).
     277                 :            :      */
     278                 :       7550 :     inline int get_delta(int info) {
     279                 :            :         /* It's implementation defined if sign extension happens when right
     280                 :            :          * shifting a signed int, although in practice sign extension is what
     281                 :            :          * most compilers implement.
     282                 :            :          *
     283                 :            :          * Some compilers are smart enough to spot common idioms for sign
     284                 :            :          * extension, but not all (e.g. GCC < 7 doesn't spot the one used in
     285                 :            :          * the else below), so check what the implementation defined behaviour
     286                 :            :          * is with a constant conditional which should get optimised away.
     287                 :            :          */
     288                 :            :         if ((-1 >> 1) == -1) {
     289                 :            :             // Right shift sign-extends.
     290                 :       3775 :             return info >> 8;
     291                 :            :         } else {
     292                 :            :             // Right shift shifts in zeros, so apply bitwise NOT before and
     293                 :            :             // after the shift for negative values to emulate sign extension.
     294                 :            :             return (info >= 0) ? (info >> 8) : (~(~info >> 8));
     295                 :            :         }
     296                 :            :     }
     297                 :            : }
     298                 :            : 
     299                 :            : /** Convert a single non-ASCII Unicode character to UTF-8.
     300                 :            :  *
     301                 :            :  *  This is intended mainly as a helper method for to_utf8().
     302                 :            :  *
     303                 :            :  *  @param ch   The character (which must be >= 128) to write to @a buf.
     304                 :            :  *  @param buf  The buffer to write the character to - it must have
     305                 :            :  *              space for (at least) 4 bytes.
     306                 :            :  *
     307                 :            :  *  @return     The length of the resultant UTF-8 character in bytes.
     308                 :            :  */
     309                 :            : XAPIAN_VISIBILITY_DEFAULT
     310                 :            : unsigned nonascii_to_utf8(unsigned ch, char * buf);
     311                 :            : 
     312                 :            : /** Convert a single Unicode character to UTF-8.
     313                 :            :  *
     314                 :            :  *  @param ch   The character to write to @a buf.
     315                 :            :  *  @param buf  The buffer to write the character to - it must have
     316                 :            :  *              space for (at least) 4 bytes.
     317                 :            :  *
     318                 :            :  *  @return     The length of the resultant UTF-8 character in bytes.
     319                 :            :  */
     320                 :     708486 : inline unsigned to_utf8(unsigned ch, char *buf) {
     321         [ +  + ]:     708486 :     if (ch < 128) { // ASCII is written inline as a single byte.
     322                 :     707862 :         *buf = static_cast<unsigned char>(ch);
     323                 :     707862 :         return 1;
     324                 :            :     }
     325                 :        624 :     return Xapian::Unicode::nonascii_to_utf8(ch, buf); // ch >= 128 here.
     326                 :            : }
     327                 :            : 
     328                 :            : /** Append the UTF-8 representation of a single Unicode character to a
     329                 :            :  *  std::string.
     330                 :            :  */
     331                 :     708486 : inline void append_utf8(std::string &s, unsigned ch) {
     332                 :            :     char buf[4]; // to_utf8() requires space for (at least) 4 bytes.
     333 [ +  - ][ +  - ]:     708486 :     s.append(buf, to_utf8(ch, buf)); // to_utf8() fills buf and returns the byte count to append.
     334                 :     708486 : }
     335                 :            : 
     336                 :            : /// Return the General Category which a given Unicode character falls into.
     337                 :     775096 : inline category get_category(unsigned ch) {
     338                 :     775096 :     return Internal::get_category(Internal::get_character_info(ch)); // Look up ch's info, then extract the category from it.
     339                 :            : }
     340                 :            : 
     341                 :            : /// Test if a given Unicode character is a "word character".
     342                 :     486844 : inline bool is_wordchar(unsigned ch) {
     343                 :            :     const unsigned int WORDCHAR_MASK = // One bit per category which counts as a word character.
     344                 :            :             (1 << Xapian::Unicode::UPPERCASE_LETTER) |
     345                 :            :             (1 << Xapian::Unicode::LOWERCASE_LETTER) |
     346                 :            :             (1 << Xapian::Unicode::TITLECASE_LETTER) |
     347                 :            :             (1 << Xapian::Unicode::MODIFIER_LETTER) |
     348                 :            :             (1 << Xapian::Unicode::OTHER_LETTER) |
     349                 :            :             (1 << Xapian::Unicode::NON_SPACING_MARK) |
     350                 :            :             (1 << Xapian::Unicode::ENCLOSING_MARK) |
     351                 :            :             (1 << Xapian::Unicode::COMBINING_SPACING_MARK) |
     352                 :            :             (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
     353                 :            :             (1 << Xapian::Unicode::LETTER_NUMBER) |
     354                 :            :             (1 << Xapian::Unicode::OTHER_NUMBER) |
     355                 :     486844 :             (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
     356                 :     486844 :     return ((WORDCHAR_MASK >> get_category(ch)) & 1); // Test the bit for ch's category.
     357                 :            : }
     358                 :            : 
     359                 :            : /// Test if a given Unicode character is a whitespace character.
     360                 :     237360 : inline bool is_whitespace(unsigned ch) {
     361                 :            :     const unsigned int WHITESPACE_MASK = // One bit per category which counts as whitespace.
     362                 :            :             (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
     363                 :            :             (1 << Xapian::Unicode::SPACE_SEPARATOR) |
     364                 :            :             (1 << Xapian::Unicode::LINE_SEPARATOR) |
     365                 :     237360 :             (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
     366                 :     237360 :     return ((WHITESPACE_MASK >> get_category(ch)) & 1); // Test the bit for ch's category.
     367                 :            : }
     368                 :            : 
     369                 :            : /// Test if a given Unicode character is a currency symbol (category Sc).
     370                 :        498 : inline bool is_currency(unsigned ch) {
     371                 :        498 :     return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
     372                 :            : }
     373                 :            : 
     374                 :            : /// Convert a Unicode character to lowercase (returned unchanged if no lowercase conversion is flagged for it).
     375                 :     284430 : inline unsigned tolower(unsigned ch) {
     376                 :     284430 :     int info = Xapian::Unicode::Internal::get_character_info(ch);
     377         [ +  + ]:     284430 :     if (!(info & Internal::INFO_TOLOWER_MASK)) // No lowercase conversion flagged for ch.
     378                 :     280707 :         return ch;
     379                 :       3723 :     return ch + Internal::get_delta(info); // The lowercase form is reached by adding the delta.
     380                 :            : }
     381                 :            : 
     382                 :            : /// Convert a Unicode character to uppercase (returned unchanged if no uppercase conversion is flagged for it).
     383                 :        171 : inline unsigned toupper(unsigned ch) {
     384                 :        171 :     int info = Xapian::Unicode::Internal::get_character_info(ch);
     385         [ +  + ]:        171 :     if (!(info & Internal::INFO_TOUPPER_MASK)) // No uppercase conversion flagged for ch.
     386                 :        119 :         return ch;
     387                 :         52 :     return ch - Internal::get_delta(info); // The uppercase form is reached by subtracting the delta.
     388                 :            : }
     389                 :            : 
     390                 :            : /// Convert a UTF-8 std::string to lowercase, returning the result as a new UTF-8 std::string.
     391                 :            : inline std::string
     392                 :      89258 : tolower(const std::string &term)
     393                 :            : {
     394                 :      89258 :     std::string result;
     395         [ +  - ]:      89258 :     result.reserve(term.size()); // Reserve space based on the input's length in bytes.
     396         [ +  + ]:     357509 :     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) { // A default-constructed iterator is the at-end marker.
     397         [ +  - ]:     268251 :         append_utf8(result, tolower(*i)); // Decode, lowercase and re-encode each character.
     398                 :            :     }
     399                 :      89258 :     return result;
     400                 :            : }
     401                 :            : 
     402                 :            : /// Convert a UTF-8 std::string to uppercase, returning the result as a new UTF-8 std::string.
     403                 :            : inline std::string
     404                 :            : toupper(const std::string &term)
     405                 :            : {
     406                 :            :     std::string result;
     407                 :            :     result.reserve(term.size()); // Reserve space based on the input's length in bytes.
     408                 :            :     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) { // A default-constructed iterator is the at-end marker.
     409                 :            :         append_utf8(result, toupper(*i)); // Decode, uppercase and re-encode each character.
     410                 :            :     }
     411                 :            :     return result;
     412                 :            : }
     413                 :            : 
     414                 :            : }
     415                 :            : 
     416                 :            : }
     417                 :            : 
     418                 :            : #endif // XAPIAN_INCLUDED_UNICODE_H

Generated by: LCOV version 1.11