LCOV - code coverage report
Current view: top level - unicode - utf8itor.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 7028d852e609 Lines: 68 68 100.0 %
Date: 2019-02-17 14:59:59 Functions: 6 6 100.0 %
Branches: 70 72 97.2 %

           Branch data     Line data    Source code
       1                 :            : /** @file utf8itor.cc
       2                 :            :  * @brief Iterate over a UTF-8 string.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2006,2007,2010,2013,2015 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include <xapian/unicode.h>
      24                 :            : 
      25                 :            : #include <cstring>
      26                 :            : 
      27                 :            : using namespace std;
      28                 :            : 
      29                 :       4510 : static inline bool bad_cont(unsigned char ch) { return (ch & 0xc0) != 0x80; }
      30                 :            : 
      31                 :            : namespace Xapian {
      32                 :            : 
      33                 :            : namespace Unicode {
      34                 :            : 
      35                 :            : // buf must have room for at least 4 bytes (the longest sequence written).
      36                 :            : unsigned
      37                 :        620 : nonascii_to_utf8(unsigned ch, char * buf)
      38                 :            : {
      39         [ +  + ]:        620 :     if (ch < 0x800) {
      40                 :         66 :         buf[0] = char(0xc0 | (ch >> 6));
      41                 :         66 :         buf[1] = char(0x80 | (ch & 0x3f));
      42                 :         66 :         return 2;
      43                 :            :     }
      44         [ +  + ]:        554 :     if (ch < 0x10000) {
      45                 :        509 :         buf[0] = char(0xe0 | (ch >> 12));
      46                 :        509 :         buf[1] = char(0x80 | ((ch >> 6) & 0x3f));
      47                 :        509 :         buf[2] = char(0x80 | (ch & 0x3f));
      48                 :        509 :         return 3;
      49                 :            :     }
      50         [ +  + ]:         45 :     if (ch < 0x200000) {
      51                 :         44 :         buf[0] = char(0xf0 | (ch >> 18));
      52                 :         44 :         buf[1] = char(0x80 | ((ch >> 12) & 0x3f));
      53                 :         44 :         buf[2] = char(0x80 | ((ch >> 6) & 0x3f));
      54                 :         44 :         buf[3] = char(0x80 | (ch & 0x3f));
      55                 :         44 :         return 4;
      56                 :            :     }
      57                 :            :     // Unicode doesn't specify any characters above 0x10ffff, but values up
      58                 :            :     // to 0x1fffff still fit the 4 byte form and are encoded above.  Anything
      59                 :            :     // larger (e.g. from a numeric character entity) we replace with nothing.
      60                 :          1 :     return 0;
      61                 :            : }
      62                 :            : 
      63                 :            : }
      64                 :            : 
      65                 :         10 : Utf8Iterator::Utf8Iterator(const char *p_)
      66                 :            : {
      67                 :         10 :     assign(p_, strlen(p_));
      68                 :         10 : }
      69                 :            : 
      70                 :            : bool
      71                 :     954347 : Utf8Iterator::calculate_sequence_length() const XAPIAN_NOEXCEPT
      72                 :            : {
      73                 :            :     // Handle invalid UTF-8, overlong sequences, and truncated sequences as
      74                 :            :     // if the text was actually in ISO-8859-1 since we need to do something
      75                 :            :     // with it, and this seems the most likely reason why we'd have invalid
      76                 :            :     // UTF-8.
      77                 :            : 
      78                 :     954347 :     unsigned char ch = *p;
      79                 :            : 
      80                 :     954347 :     seqlen = 1;
      81                 :            :     // Single byte encoding (0x00-0x7f) or invalid (0x80-0xbf) or overlong
      82                 :            :     // sequence (0xc0-0xc1).
      83                 :            :     //
      84                 :            :     // (0xc0 and 0xc1 would start 2 byte sequences for characters which are
      85                 :            :     // representable in a single byte, and we should not decode these.)
      86         [ +  + ]:     954347 :     if (ch < 0xc2) return (ch < 0x80);
      87                 :            : 
      88         [ +  + ]:       1301 :     if (ch < 0xe0) {
      89   [ +  +  +  + ]:        656 :         if (p + 1 == end || // Not enough bytes
                 [ +  + ]
      90                 :        323 :             bad_cont(p[1])) // Invalid
      91                 :         21 :             return false;
      92                 :        312 :         seqlen = 2;
      93                 :        312 :         return true;
      94                 :            :     }
      95         [ +  + ]:        968 :     if (ch < 0xf0) {
      96 [ +  + ][ +  + ]:       1782 :         if (end - p < 3 || // Not enough bytes
      97 [ +  + ][ +  + ]:       1792 :             bad_cont(p[1]) || bad_cont(p[2]) || // Invalid
                 [ +  + ]
      98         [ +  + ]:         10 :             (p[0] == 0xe0 && p[1] < 0xa0)) // Overlong encoding
      99                 :         15 :             return false;
     100                 :        879 :         seqlen = 3;
     101                 :        879 :         return true;
     102                 :            :     }
     103 [ +  + ][ +  + ]:        140 :     if (ch >= 0xf5 || // Code value above Unicode
     104         [ +  + ]:         59 :         end - p < 4 || // Not enough bytes
     105 [ +  + ][ +  + ]:        112 :         bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) || // Invalid
                 [ +  + ]
     106 [ +  + ][ +  + ]:        142 :         (p[0] == 0xf0 && p[1] < 0x90) || // Overlong encoding
                 [ +  + ]
     107         [ +  + ]:          2 :         (p[0] == 0xf4 && p[1] >= 0x90)) // Code value above Unicode
     108                 :         28 :         return false;
     109                 :         46 :     seqlen = 4;
     110                 :         46 :     return true;
     111                 :            : }
     112                 :            : 
     113                 :    1728139 : unsigned Utf8Iterator::operator*() const XAPIAN_NOEXCEPT {
     114         [ +  + ]:    1728139 :     if (p == NULL) return unsigned(-1);
     115         [ +  + ]:    1727969 :     if (seqlen == 0) calculate_sequence_length();
     116                 :    1727969 :     unsigned char ch = *p;
     117         [ +  + ]:    1727969 :     if (seqlen == 1) return ch;
     118         [ +  + ]:       1651 :     if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
     119         [ +  + ]:       1295 :     if (seqlen == 3)
     120                 :       1233 :         return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
     121                 :        124 :     return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
     122                 :         62 :             ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
     123                 :            : }
     124                 :            : 
     125                 :            : unsigned
     126                 :      35397 : Utf8Iterator::strict_deref() const XAPIAN_NOEXCEPT
     127                 :            : {
     128         [ -  + ]:      35397 :     if (p == NULL) return unsigned(-1);
     129         [ +  - ]:      35397 :     if (seqlen == 0) {
     130         [ +  + ]:      35397 :         if (!calculate_sequence_length())
     131                 :         42 :             return unsigned(*p) | 0x80000000;
     132                 :            :     }
     133                 :      35355 :     unsigned char ch = *p;
     134         [ +  + ]:      35355 :     if (seqlen == 1) return ch;
     135         [ +  + ]:        173 :     if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
     136         [ +  + ]:        152 :     if (seqlen == 3)
     137                 :        141 :         return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
     138                 :         22 :     return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
     139                 :         11 :             ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
     140                 :            : }
     141                 :            : 
     142                 :            : }

Generated by: LCOV version 1.11