LCOV - code coverage report
Current view: top level - tests - api_unicode.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 228 228 100.0 %
Date: 2019-06-30 05:20:33 Functions: 7 7 100.0 %
Branches: 230 914 25.2 %

           Branch data     Line data    Source code
       1                 :            : /** @file api_unicode.cc
       2                 :            :  * @brief Test the Unicode and UTF-8 classes and functions.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include "api_unicode.h"
      24                 :            : 
      25                 :            : #include <xapian.h>
      26                 :            : 
      27                 :            : #include "apitest.h"
      28                 :            : #include "testutils.h"
      29                 :            : 
      30                 :            : #include <cctype>
      31                 :            : 
      32                 :            : using namespace std;
      33                 :            : 
      34                 :            : struct testcase {
      35                 :            :     const char * a, * b;
      36                 :            : };
      37                 :            : 
      38                 :            : static const testcase testcases[] = {
      39                 :            :     { "abcd", "abcd" }, // Sanity check!
      40                 :            :     { "a\x80""bcd", "a\xc2\x80""bcd" },
      41                 :            :     { "a\xa0", "a\xc2\xa0" },
      42                 :            :     { "a\xa0z", "a\xc2\xa0z" },
      43                 :            :     { "x\xc1yz", "x\xc3\x81yz" },
      44                 :            :     { "\xc2z", "\xc3\x82z" },
      45                 :            :     { "\xc2", "\xc3\x82" },
      46                 :            :     { "xy\xc3z", "xy\xc3\x83z" },
      47                 :            :     { "xy\xc3\xc3z", "xy\xc3\x83\xc3\x83z" },
      48                 :            :     { "xy\xc3\xc3", "xy\xc3\x83\xc3\x83" },
      49                 :            :     { "\xe0", "\xc3\xa0" },
      50                 :            :     { "\xe0\x80", "\xc3\xa0\xc2\x80" },
      51                 :            :     { "\xe0\xc0", "\xc3\xa0\xc3\x80" },
      52                 :            :     { "\xe0\xc0z", "\xc3\xa0\xc3\x80z" },
      53                 :            :     { "\xe0\xc0zz", "\xc3\xa0\xc3\x80zz" },
      54                 :            :     { "\xe0\xc0\x81", "\xc3\xa0\xc3\x80\xc2\x81" },
      55                 :            :     { "\xe0\x82\xc1", "\xc3\xa0\xc2\x82\xc3\x81" },
      56                 :            :     { "\xe0\xc5\xc7", "\xc3\xa0\xc3\x85\xc3\x87" },
      57                 :            :     { "\xf0", "\xc3\xb0" },
      58                 :            :     { "\xf0\x80", "\xc3\xb0\xc2\x80" },
      59                 :            :     { "\xf0\xc0", "\xc3\xb0\xc3\x80" },
      60                 :            :     { "\xf0\xc0z", "\xc3\xb0\xc3\x80z" },
      61                 :            :     { "\xf0\xc0zz", "\xc3\xb0\xc3\x80zz" },
      62                 :            :     { "\xf0\xc0\x81", "\xc3\xb0\xc3\x80\xc2\x81" },
      63                 :            :     { "\xf0\x82\xc1", "\xc3\xb0\xc2\x82\xc3\x81" },
      64                 :            :     { "\xf0\xc5\xc7", "\xc3\xb0\xc3\x85\xc3\x87" },
      65                 :            :     { "\xf0\xc0\x81\xc9", "\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
      66                 :            :     { "\xf0\x82\xc1\xc8", "\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
      67                 :            :     { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
      68                 :            :     { "\xf0\xc0\x81\x89", "\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
      69                 :            :     { "\xf0\x82\xc1\x88", "\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
      70                 :            :     { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
      71                 :            :     { "\xf4P\x80\x80", "\xc3\xb4P\xc2\x80\xc2\x80" },
      72                 :            :     { "\xf4\x80P\x80", "\xc3\xb4\xc2\x80P\xc2\x80" },
      73                 :            :     { "\xf4\x80\x80P", "\xc3\xb4\xc2\x80\xc2\x80P" },
      74                 :            :     { "\xfe\xffxyzzy", "\xc3\xbe\xc3\xbfxyzzy" },
      75                 :            :     // Overlong encodings:
      76                 :            :     { "\xc0\x80", "\xc3\x80\xc2\x80" },
      77                 :            :     { "\xc0\xbf", "\xc3\x80\xc2\xbf" },
      78                 :            :     { "\xc1\x80", "\xc3\x81\xc2\x80" },
      79                 :            :     { "\xc1\xbf", "\xc3\x81\xc2\xbf" },
      80                 :            :     { "\xe0\x80\x80", "\xc3\xa0\xc2\x80\xc2\x80" },
      81                 :            :     { "\xe0\x9f\xbf", "\xc3\xa0\xc2\x9f\xc2\xbf" },
      82                 :            :     { "\xf0\x80\x80\x80", "\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
      83                 :            :     { "\xf0\x8f\xbf\xbf", "\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
      84                 :            :     // Above Unicode:
      85                 :            :     { "\xf4\x90\x80\x80", "\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
      86                 :            :     { 0, 0 }
      87                 :            : };
      88                 :            : 
      89                 :            : // Test handling of invalid UTF-8 is as desired.
      90                 :          1 : DEFINE_TESTCASE(utf8iterator1, !backend) {
      91                 :            :     const testcase * p;
      92         [ +  + ]:         46 :     for (p = testcases; p->a; ++p) {
      93 [ +  - ][ +  - ]:         45 :         tout.str(string());
      94 [ +  - ][ +  - ]:         45 :         tout << '"' << p->a << "\" and \"" << p->b << '"' << endl;
         [ +  - ][ +  - ]
         [ +  - ][ +  - ]
      95                 :         45 :         size_t a_len = strlen(p->a);
      96                 :         45 :         Xapian::Utf8Iterator a(p->a, a_len);
      97                 :            : 
      98                 :         45 :         size_t b_len = strlen(p->b);
      99                 :         45 :         Xapian::Utf8Iterator b(p->b, b_len);
     100                 :            : 
     101 [ +  + ][ +  - ]:        190 :         while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
         [ +  + ][ +  - ]
                 [ +  + ]
     102 [ -  + ][ #  # ]:        145 :             TEST_EQUAL(*a, *b);
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     103                 :        145 :             ++a;
     104                 :        145 :             ++b;
     105                 :            :         }
     106                 :            : 
     107                 :            :         // Test that we don't reach the end of one before the other.
     108 [ -  + ][ #  # ]:         45 :         TEST(a == Xapian::Utf8Iterator());
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
     109 [ -  + ][ #  # ]:         45 :         TEST(b == Xapian::Utf8Iterator());
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
     110                 :            :     }
     111                 :          1 :     return true;
     112                 :            : }
     113                 :            : 
     114                 :            : struct testcase2 {
     115                 :            :     const char * a;
     116                 :            :     unsigned long n;
     117                 :            : };
     118                 :            : 
     119                 :            : static const testcase2 testcases2[] = {
     120                 :            :     { "a", 97 },
     121                 :            :     { "\x80", 128 },
     122                 :            :     { "\xa0", 160 },
     123                 :            :     { "\xc2\x80", 128 },
     124                 :            :     { "\xc2\xa0", 160 },
     125                 :            :     { "\xe0\xa0\x80", 0x0800 },
     126                 :            :     { "\xe1\x80\x80", 0x1000 },
     127                 :            :     { "\xf0\xa8\xa8\x8f", 166415 },
     128                 :            :     { "\xf3\x80\x80\x80", 0x0c0000 },
     129                 :            :     { "\xf4\x80\x80\x80", 0x100000 },
     130                 :            :     { 0, 0 }
     131                 :            : };
     132                 :            : 
     133                 :            : // Test decoding of UTF-8.
     134                 :          1 : DEFINE_TESTCASE(utf8iterator2, !backend) {
     135                 :            :     const testcase2 * p;
     136         [ +  + ]:         11 :     for (p = testcases2; p->a; ++p) {
     137         [ +  - ]:         10 :         Xapian::Utf8Iterator a(p->a);
     138                 :            : 
     139 [ -  + ][ #  # ]:         10 :         TEST(a != Xapian::Utf8Iterator());
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
     140 [ -  + ][ #  # ]:         10 :         TEST_EQUAL(*a, p->n);
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     141 [ -  + ][ #  # ]:         10 :         TEST(++a == Xapian::Utf8Iterator());
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
     142                 :            :     }
     143                 :          1 :     return true;
     144                 :            : }
     145                 :            : 
     146                 :            : // Test Unicode categorisation.
     147                 :          1 : DEFINE_TESTCASE(unicode1, !backend) {
     148                 :            :     using namespace Xapian;
     149 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category('a'), Unicode::LOWERCASE_LETTER);
     150 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category('0'), Unicode::DECIMAL_DIGIT_NUMBER);
     151 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category('$'), Unicode::CURRENCY_SYMBOL);
     152 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xa3), Unicode::CURRENCY_SYMBOL);
     153                 :            :     // U+0242 was added in Unicode 5.0.0.
     154 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x242), Unicode::LOWERCASE_LETTER);
     155                 :            :     // U+0526 was added in Unicode 6.0.0.
     156 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0526), Unicode::UPPERCASE_LETTER);
     157                 :            :     // U+0527 was added in Unicode 6.0.0.
     158 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0527), Unicode::LOWERCASE_LETTER);
     159                 :            :     // U+0620 was added in Unicode 6.0.0.
     160 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0620), Unicode::OTHER_LETTER);
     161                 :            :     // U+065F was added in Unicode 6.0.0.
     162 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x065F), Unicode::NON_SPACING_MARK);
     163                 :            :     // U+06DE changed category in Unicode 6.0.0.
     164 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x06DE), Unicode::OTHER_SYMBOL);
     165                 :            :     // U+0840 was added in Unicode 6.0.0.
     166 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0840), Unicode::OTHER_LETTER);
     167                 :            :     // U+093A was added in Unicode 6.0.0.
     168 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x093A), Unicode::NON_SPACING_MARK);
     169                 :            :     // U+093B was added in Unicode 6.0.0.
     170 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x093B), Unicode::COMBINING_SPACING_MARK);
     171                 :            :     // U+0CF1 changed category in Unicode 6.0.0.
     172 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0CF1), Unicode::OTHER_LETTER);
     173                 :            :     // U+0CF2 changed category in Unicode 6.0.0.
     174 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0CF2), Unicode::OTHER_LETTER);
     175                 :            :     // U+11A7 was added in Unicode 5.2.0.
     176 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x11A7), Unicode::OTHER_LETTER);
     177                 :            :     // U+9FCB was added in Unicode 5.2.0.
     178 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x9FCB), Unicode::OTHER_LETTER);
     179                 :            :     // U+FA6C was added in Unicode 5.2.0.
     180 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xFA6C), Unicode::OTHER_LETTER);
     181 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xFFFF), Unicode::UNASSIGNED);
     182                 :            :     // Test characters outside BMP.
     183 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x10345), Unicode::OTHER_LETTER);
     184 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x10FFFD), Unicode::PRIVATE_USE);
     185 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x10FFFF), Unicode::UNASSIGNED);
     186                 :            :     // U+1109A was added in Unicode 5.2.0.
     187 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1109a), Unicode::OTHER_LETTER);
     188                 :            :     // U+1F773 was added in Unicode 6.0.0.
     189 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1F773), Unicode::OTHER_SYMBOL);
     190                 :            :     // U+2B740 was added in Unicode 6.0.0.
     191 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x2B740), Unicode::OTHER_LETTER);
     192                 :            :     // U+2B81D was added in Unicode 6.0.0.
     193 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x2B81D), Unicode::OTHER_LETTER);
     194                 :            :     // U+00A7 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
     195 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xA7), Unicode::OTHER_PUNCTUATION);
     196                 :            :     // U+00AA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
     197 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xAA), Unicode::OTHER_LETTER);
     198                 :            :     // U+00B6 changed category in Unicode 6.1.0 (was OTHER_SYMBOL).
     199 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xB6), Unicode::OTHER_PUNCTUATION);
     200                 :            :     // U+00BA changed category in Unicode 6.1.0 (was LOWERCASE_LETTER).
     201 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xBA), Unicode::OTHER_LETTER);
     202                 :            :     // U+058F was added in Unicode 6.1.0.
     203 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x058F), Unicode::CURRENCY_SYMBOL);
     204                 :            :     // U+0604 was added in Unicode 6.1.0.
     205 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0604), Unicode::FORMAT);
     206                 :            :     // U+08A0 was added in Unicode 6.1.0.
     207 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x08A0), Unicode::OTHER_LETTER);
     208                 :            :     // U+08E4 was added in Unicode 6.1.0.
     209 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x08E4), Unicode::NON_SPACING_MARK);
     210                 :            :     // U+0AF0 was added in Unicode 6.1.0.
     211 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0AF0), Unicode::OTHER_PUNCTUATION);
     212                 :            :     // U+9FCC was added in Unicode 6.1.0.
     213 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x9FCC), Unicode::OTHER_LETTER);
     214                 :            :     // U+A7F9 was added in Unicode 6.1.0.
     215 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xA7F9), Unicode::MODIFIER_LETTER);
     216                 :            :     // U+110F0 was added in Unicode 6.1.0.
     217 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x110F0), Unicode::DECIMAL_DIGIT_NUMBER);
     218                 :            :     // U+11100 was added in Unicode 6.1.0.
     219 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x11100), Unicode::NON_SPACING_MARK);
     220                 :            :     // U+1EEF0 was added in Unicode 6.1.0.
     221 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1EEF0), Unicode::MATH_SYMBOL);
     222                 :            :     // U+1F634 was added in Unicode 6.1.0.
     223 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1F634), Unicode::OTHER_SYMBOL);
     224                 :            :     // U+20BA was added in Unicode 6.2.0.
     225 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x20BA), Unicode::CURRENCY_SYMBOL);
     226                 :            :     // U+061C was added in Unicode 6.3.0.
     227 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x61C), Unicode::FORMAT);
     228                 :            :     // U+037F "GREEK CAPITAL LETTER YOT" was added in Unicode 7.0.0.
     229 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x37F), Unicode::UPPERCASE_LETTER);
     230                 :            : 
     231                 :            :     // Added or changed in Unicode 8.0.0:
     232                 :            :     // U+08B3 "ARABIC LETTER AIN WITH THREE DOTS BELOW".
     233 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x8B3), Unicode::OTHER_LETTER);
     234                 :            :     // U+0AF9 "GUJARATI LETTER ZHA".
     235 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xAF9), Unicode::OTHER_LETTER);
     236                 :            :     // U+0C5A "TELUGU LETTER RRRA".
     237 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xC5A), Unicode::OTHER_LETTER);
     238                 :            :     // U+0D5F "MALAYALAM LETTER ARCHAIC II".
     239 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xD5F), Unicode::OTHER_LETTER);
     240                 :            :     // U+13F5 "CHEROKEE LETTER MV".
     241 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x13F5), Unicode::UPPERCASE_LETTER);
     242                 :            :     // U+13F8 "CHEROKEE SMALL LETTER YE".
     243 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x13F8), Unicode::LOWERCASE_LETTER);
     244                 :            :     // U+19B7 "NEW TAI LUE VOWEL SIGN O" changed to be OTHER_LETTER in 8.0.0.
     245 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x19B7), Unicode::OTHER_LETTER);
     246                 :            :     // U+20BE "LARI SIGN".
     247 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x20BE), Unicode::CURRENCY_SYMBOL);
     248                 :            :     // U+218A "TURNED DIGIT TWO".
     249 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x218A), Unicode::OTHER_SYMBOL);
     250                 :            :     // U+10C9C "OLD HUNGARIAN CAPITAL LETTER OO".
     251 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x10C9C), Unicode::UPPERCASE_LETTER);
     252                 :            :     // U+12399 "CUNEIFORM SIGN U U".
     253 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x12399), Unicode::OTHER_LETTER);
     254                 :            :     // U+1D800 "SIGNWRITING HAND-FIST INDEX".
     255 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1D800), Unicode::OTHER_SYMBOL);
     256                 :            : 
     257                 :            :     // Added or changed in Unicode 9.0.0:
     258                 :            :     // U+08B6 "ARABIC LETTER BEH WITH SMALL MEEM ABOVE"
     259 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x8B6), Unicode::OTHER_LETTER);
     260                 :            :     // U+08E2 "ARABIC DISPUTED END OF AYAH"
     261 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x8E2), Unicode::FORMAT);
     262                 :            :     // U+0C80 "KANNADA SIGN SPACING CANDRABINDU"
     263 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xC80), Unicode::OTHER_LETTER);
     264                 :            :     // U+0D56 "MALAYALAM LETTER CHILLU LLL"
     265 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xD56), Unicode::OTHER_LETTER);
     266                 :            :     // U+0D58 "MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH"
     267 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xD58), Unicode::OTHER_NUMBER);
     268                 :            :     // U+1885 "MONGOLIAN LETTER ALI GALI BALUDA"
     269 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1885), Unicode::NON_SPACING_MARK);
     270                 :            :     // U+1886 "MONGOLIAN LETTER ALI GALI THREE BALUDA"
     271 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1886), Unicode::NON_SPACING_MARK);
     272                 :            :     // U+104FB "OSAGE SMALL LETTER ZHA"
     273 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x104FB), Unicode::LOWERCASE_LETTER);
     274                 :            :     // U+1141F "NEWA LETTER TA"
     275 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1141F), Unicode::OTHER_LETTER);
     276                 :            :     // U+1F989 "OWL"
     277 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1F989), Unicode::OTHER_SYMBOL);
     278                 :            : 
     279                 :            :     // Added in Unicode 10.0.0:
     280                 :            :     // U+20BF "BITCOIN SIGN"
     281 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x20BF), Unicode::CURRENCY_SYMBOL);
     282                 :            :     // U+23FF "OBSERVER EYE SYMBOL"
     283 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x23FF), Unicode::OTHER_SYMBOL);
     284                 :            :     // U+1032D "OLD ITALIC LETTER YE"
     285 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1032D), Unicode::OTHER_LETTER);
     286                 :            :     // U+11A34 "ZANABAZAR SQUARE SIGN VIRAMA"
     287 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x11A34), Unicode::NON_SPACING_MARK);
     288                 :            :     // U+1F6F8 "FLYING SAUCER"
     289 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1F6F8), Unicode::OTHER_SYMBOL);
     290                 :            :     // U+1F9E6 "SOCKS"
     291 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1F9E6), Unicode::OTHER_SYMBOL);
     292                 :            : 
     293                 :            :     // Added in Unicode 11.0.0:
     294                 :            :     // U+0560 "ARMENIAN SMALL LETTER TURNED AYB"
     295 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0560), Unicode::LOWERCASE_LETTER);
     296                 :            :     // U+05EF "HEBREW YOD TRIANGLE"
     297 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x05EF), Unicode::OTHER_LETTER);
     298                 :            :     // U+07FF "NKO TAMAN SIGN"
     299 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x07FF), Unicode::CURRENCY_SYMBOL);
     300                 :            :     // U+08D3 "ARABIC SMALL LOW WAW"
     301 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x08D3), Unicode::NON_SPACING_MARK);
     302                 :            :     // U+1878 "MONGOLIAN LETTER CHA WITH TWO DOTS"
     303 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1878), Unicode::OTHER_LETTER);
     304                 :            :     // U+1F12F "COPYLEFT SYMBOL"
     305 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1F12F), Unicode::OTHER_SYMBOL);
     306                 :            : 
     307                 :            :     // Changed category in Unicode 11.0.0:
     308                 :            :     // U+10D0 "GEORGIAN LETTER AN"
     309 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x10D0), Unicode::LOWERCASE_LETTER);
     310                 :            : 
     311                 :            :     // Added in Unicode 12.0.0:
     312                 :            :     // U+0C77 "TELUGU SIGN SIDDHAM"
     313 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x0C77), Unicode::OTHER_PUNCTUATION);
     314                 :            :     // U+2BC9 "NEPTUNE FORM TWO"
     315 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x2BC9), Unicode::OTHER_SYMBOL);
     316                 :            :     // U+A7C5 "LATIN CAPITAL LETTER S WITH HOOK"
     317 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xA7C5), Unicode::UPPERCASE_LETTER);
     318                 :            :     // U+1FA90 "RINGED PLANET"
     319 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1FA90), Unicode::OTHER_SYMBOL);
     320                 :            : 
     321                 :            :     // Test some invalid Unicode values.
     322 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x110000), Unicode::UNASSIGNED);
     323 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0xFFFFFFFF), Unicode::UNASSIGNED);
     324                 :          1 :     return true;
     325                 :            : }
     326                 :            : 
     327                 :          1 : DEFINE_TESTCASE(caseconvert1, !backend) {
     328                 :            :     using namespace Xapian;
     329         [ +  + ]:        129 :     for (unsigned ch = 0; ch < 128; ++ch) {
     330 [ -  + ][ #  # ]:        128 :         TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower(ch)));
     331 [ -  + ][ #  # ]:        128 :         TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper(ch)));
     332                 :            :     }
     333                 :            : 
     334                 :            :     // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
     335 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x242), 0x242);
     336 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x242), 0x241);
     337 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x241), 0x241);
     338 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x241), 0x242);
     339                 :            : 
     340                 :            :     // Regression test for bug fixed in 1.2.17.
     341 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x1c5), 0x1c6);
     342 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x1c8), 0x1c9);
     343 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x1cb), 0x1cc);
     344 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x1f2), 0x1f3);
     345                 :            : 
     346                 :            :     // Pound currency symbol:
     347 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
     348 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
     349                 :            :     // Unassigned:
     350 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
     351 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
     352                 :            :     // Test characters outside BMP.
     353 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
     354 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
     355 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
     356 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
     357 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
     358 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
     359                 :            :     // Test some invalid Unicode values.
     360 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
     361 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
     362 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
     363 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
     364                 :            : 
     365                 :          1 :     return true;
     366                 :            : }
     367                 :            : 
     368                 :            : /// Test case conversion and category lookups for characters from Unicode 5.1 and later.
     369                 :          1 : DEFINE_TESTCASE(caseconvert2, !backend) {
     370                 :            :     using namespace Xapian;
     371                 :            :     // The uppercase forms here (U+2C6D-U+2C6F) are listed as Unicode 5.1.0 additions in unicodepredicates1:
     372 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x250), 0x2c6f);
     373 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x251), 0x2c6d);
     374 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x271), 0x2c6e);
     375                 :            :     // Categories of characters noted as Unicode 5.1.0 additions in unicodepredicates1:
     376 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x2ec), Unicode::MODIFIER_LETTER);
     377 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x374), Unicode::MODIFIER_LETTER);
     378 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x487), Unicode::NON_SPACING_MARK);
     379 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x5be), Unicode::DASH_PUNCTUATION);
     380 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::get_category(0x1f093), Unicode::OTHER_SYMBOL);
     381                 :            : 
     382                 :            :     // U+0526, U+0527 and U+A78D were added in Unicode 6.0.0:
     383 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x265), 0xa78d);
     384 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xa78d), 0x265);
     385 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x526), 0x527);
     386 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x527), 0x526);
     387                 :            : 
     388                 :            :     // U+A7AA was added in Unicode 6.1.0 (the U+0526/U+0527 checks below repeat the 6.0.0 ones above):
     389 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x266), 0xa7aa);
     390 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xa7aa), 0x266);
     391 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x526), 0x527);
     392 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x527), 0x526);
     393                 :            :     // Each uppercase/lowercase pair below must map to its partner in both directions:
     394 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x370), 0x371);
     395 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x371), 0x370);
     396 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x372), 0x373);
     397 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x373), 0x372);
     398 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x376), 0x377);
     399 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x377), 0x376);
     400 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x3cf), 0x3d7);
     401 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x3d7), 0x3cf);
     402                 :            : 
     403                 :            :     // U+20BA was added in Unicode 6.2.0:
     404 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x20ba), 0x20ba);
     405 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x20ba), 0x20ba);
     406                 :            : 
     407                 :            :     // U+061C was added in Unicode 6.3.0:
     408 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x61c), 0x61c);
     409 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x61c), 0x61c);
     410                 :            :     // U+0514..U+0523: each even code point is uppercase and is followed by its lowercase form.
     411                 :            :     unsigned u;
     412         [ +  + ]:          9 :     for (u = 0x514; u < 0x524; u += 2) {
     413 [ -  + ][ #  # ]:          8 :         TEST_EQUAL(Unicode::get_category(u), Unicode::UPPERCASE_LETTER);
     414 [ -  + ][ #  # ]:          8 :         TEST_EQUAL(Unicode::get_category(u + 1), Unicode::LOWERCASE_LETTER);
     415 [ -  + ][ #  # ]:          8 :         TEST_EQUAL(Unicode::tolower(u), u + 1);
     416 [ -  + ][ #  # ]:          8 :         TEST_EQUAL(Unicode::toupper(u + 1), u);
     417                 :            :     }
     418                 :            : 
     419                 :            :     // U+A7B1 was added in Unicode 8.0.0 as an uppercase form of U+0287.
     420 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xA7B1), 0x0287);
     421 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xA7B1), 0xA7B1);
     422 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x0287), 0x0287);
     423 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x0287), 0xA7B1);
     424                 :            : 
     425                 :            :     // U+A7B4 (capital) and U+A7B5 (small) added in Unicode 8.0.0
     426 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xA7B4), 0xA7B5);
     427 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xA7B4), 0xA7B4);
     428 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xA7B5), 0xA7B5);
     429 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xA7B5), 0xA7B4);
     430                 :            : 
     431                 :            :     // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A.
     432 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
     433 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
     434 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
     435 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
     436                 :            : 
     437                 :            :     // U+A7AE was added in Unicode 9.0.0 as an uppercase form of U+026A (NOTE(review): verbatim repeat of the four checks above).
     438 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xA7AE), 0x026A);
     439 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xA7AE), 0xA7AE);
     440 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x026A), 0x026A);
     441 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x026A), 0xA7AE);
     442                 :            : 
     443                 :            :     // U+0560 was added in Unicode 11.0.0 (lowercase, no other forms).
     444 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x0560), 0x0560);
     445 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x0560), 0x0560);
     446                 :            : 
     447                 :            :     // U+10D0 changed to be lowercase in Unicode 11.0.0 and U+1C90 was added.
     448 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x10D0), 0x10D0);
     449 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x10D0), 0x1C90);
     450 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x1C90), 0x10D0);
     451 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x1C90), 0x1C90);
     452                 :            : 
     453                 :            :     // U+A7C5 was added in Unicode 12.0.0 as an uppercase form of U+0282.
     454 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0xA7C5), 0x0282);
     455 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0xA7C5), 0xA7C5);
     456 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::tolower(0x0282), 0x0282);
     457 [ -  + ][ #  # ]:          1 :     TEST_EQUAL(Unicode::toupper(0x0282), 0xA7C5);
     458                 :            : 
     459                 :          1 :     return true;
     460                 :            : }
     461                 :            : /// Test that Unicode::append_utf8() builds the expected UTF-8 byte sequence for each code point.
     462                 :          1 : DEFINE_TESTCASE(utf8convert1, !backend) {
     463         [ +  - ]:          1 :     string s;
     464         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 'a');
     465         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 128);
     466         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 160);
     467         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 0xFFFF);
     468         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 166415);
     469         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 0x10345);
     470         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 0x10FFFD);
     471         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 0xFFFFFFFF);
     472         [ +  - ]:          1 :     Xapian::Unicode::append_utf8(s, 'z');
     473 [ +  - ][ -  + ]:          1 :     TEST_STRINGS_EQUAL(s, "a"
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     474                 :            :                           "\xc2\x80" // 128 (two-byte sequence)
     475                 :            :                           "\xc2\xa0" // 160
     476                 :            :                           "\xef\xbf\xbf" // 0xFFFF (three-byte sequence)
     477                 :            :                           "\xf0\xa8\xa8\x8f" // 166415 == 0x28A0F (four-byte sequence)
     478                 :            :                           "\xf0\x90\x8d\x85" // 0x10345
     479                 :            :                           "\xf4\x8f\xbf\xbd" // 0x10FFFD
     480                 :            :                           "" // 0xFFFFFFFF: the test expects nothing to be appended
     481                 :            :                           "z"
     482                 :            :                           );
     483                 :            : 
     484                 :          1 :     return true;
     485                 :            : }
     486                 :            : /// Test is_wordchar(), is_currency() and is_whitespace() on sample code points from several categories.
     487                 :          1 : DEFINE_TESTCASE(unicodepredicates1, !backend) {
     488                 :            :     static const unsigned wordchars[] = { // zero-terminated: the loop below stops at the 0 sentinel
     489                 :            :         // DECIMAL_DIGIT_NUMBER
     490                 :            :         '0', '7', '9',
     491                 :            :         0x10D30, // (added in Unicode 11.0.0)
     492                 :            :         0x11D51, // (added in Unicode 10.0.0)
     493                 :            :         0x11DA9, // (added in Unicode 11.0.0)
     494                 :            :         // OTHER_NUMBER
     495                 :            :         0x1ECB3, // (added in Unicode 11.0.0)
     496                 :            :         // LOWERCASE_LETTER
     497                 :            :         'a', 'z', 0x250, 0x251, 0x271, 0x3d7,
     498                 :            :         0x242, // (added in Unicode 5.0.0)
     499                 :            :         // LOWERCASE_LETTER (added in Unicode 5.1.0); NOTE(review): caseconvert2 asserts even 0x514-0x522 are UPPERCASE_LETTER (harmless here: only is_wordchar() is checked)
     500                 :            :         0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
     501                 :            :         0x520, 0x522,
     502                 :            :         0x16E78, // (added in Unicode 11.0.0)
     503                 :            :         // UPPERCASE_LETTER
     504                 :            :         'A', 'Z', 0x241,
     505                 :            :         // UPPERCASE_LETTER (added in Unicode 5.1.0); NOTE(review): caseconvert2 asserts odd 0x515-0x523 are LOWERCASE_LETTER (harmless here: only is_wordchar() is checked)
     506                 :            :         0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
     507                 :            :         0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
     508                 :            :         0x16E45, // (added in Unicode 11.0.0)
     509                 :            :         // OTHER_LETTER
     510                 :            :         0x8bb, // Added in Unicode 9.0.0
     511                 :            :         0xc80, // Added in Unicode 9.0.0
     512                 :            :         0xe86, // Added in Unicode 12.0.0
     513                 :            :         0x312e, // Added in Unicode 10.0.0
     514                 :            :         0x10345,
     515                 :            :         // MODIFIER_LETTER
     516                 :            :         0x2ec, // Added in Unicode 5.1.0
     517                 :            :         0x374, // Added in Unicode 5.1.0
     518                 :            :         0x16fe1, // Added in Unicode 10.0.0
     519                 :            :         0x16fe3, // Added in Unicode 12.0.0
     520                 :            :         // NON_SPACING_MARK (added to is_wordchar() in 1.1.0)
     521                 :            :         0x651,
     522                 :            :         0x487, // Added in Unicode 5.1.0
     523                 :            :         0x8d3, // Added in Unicode 11.0.0
     524                 :            :         0x8db, // Added in Unicode 9.0.0
     525                 :            :         0xeba, // Added in Unicode 12.0.0
     526                 :            :         0x11d47, // Added in Unicode 10.0.0
     527                 :            :         0
     528                 :            :     };
     529                 :            :     static const unsigned currency[] = { // zero-terminated
     530                 :            :         // CURRENCY_SYMBOL
     531                 :            :         '$', 0xa3,
     532                 :            :         // CURRENCY_SYMBOL (added in Unicode 6.2.0)
     533                 :            :         0x20ba,
     534                 :            :         // CURRENCY_SYMBOL (added in Unicode 8.0.0)
     535                 :            :         0x20be,
     536                 :            :         // CURRENCY_SYMBOL (added in Unicode 10.0.0)
     537                 :            :         0x20bf,
     538                 :            :         // CURRENCY_SYMBOL (added in Unicode 11.0.0)
     539                 :            :         0x7fe,
     540                 :            :         // CURRENCY_SYMBOL (added in Unicode 12.0.0)
     541                 :            :         0x1e2ff,
     542                 :            :         0
     543                 :            :     };
     544                 :            :     static const unsigned whitespace[] = { // zero-terminated
     545                 :            :         // CONTROL
     546                 :            :         '\t', '\n', '\f', '\r',
     547                 :            :         // SPACE_SEPARATOR
     548                 :            :         ' ',
     549                 :            :         0
     550                 :            :     };
     551                 :            :     static const unsigned other[] = { // zero-terminated; entries must fail all three predicates
     552                 :            :         // DASH_PUNCTUATION (added in Unicode 5.1.0)
     553                 :            :         0x5be,
     554                 :            :         // OTHER_SYMBOL
     555                 :            :         0xd4f, // Added in Unicode 9.0.0
     556                 :            :         0x1f093, // Added in Unicode 5.1.0
     557                 :            :         0x1f263, // Added in Unicode 10.0.0
     558                 :            :         0x1fa62, // Added in Unicode 11.0.0
     559                 :            :         // FORMAT
     560                 :            :         0x61c, // Added in Unicode 6.3.0
     561                 :            :         0x8e2, // Added in Unicode 9.0.0
     562                 :            :         // UNASSIGNED
     563                 :            :         0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
     564                 :            :         // PRIVATE_USE
     565                 :            :         0x10fffd,
     566                 :            :         0
     567                 :            :     };
     568                 :            :     // Word characters: only is_wordchar() may be true.
     569         [ +  + ]:         61 :     for (const unsigned * p = wordchars; *p; ++p) {
     570 [ -  + ][ #  # ]:         60 :         TEST(Xapian::Unicode::is_wordchar(*p));
     571 [ -  + ][ #  # ]:         60 :         TEST(!Xapian::Unicode::is_currency(*p));
     572 [ -  + ][ #  # ]:         60 :         TEST(!Xapian::Unicode::is_whitespace(*p));
     573                 :            :     }
     574                 :            :     // Currency symbols: only is_currency() may be true.
     575         [ +  + ]:          8 :     for (const unsigned * p = currency; *p; ++p) {
     576 [ -  + ][ #  # ]:          7 :         TEST(!Xapian::Unicode::is_wordchar(*p));
     577 [ -  + ][ #  # ]:          7 :         TEST(Xapian::Unicode::is_currency(*p));
     578 [ -  + ][ #  # ]:          7 :         TEST(!Xapian::Unicode::is_whitespace(*p));
     579                 :            :     }
     580                 :            :     // Whitespace: only is_whitespace() may be true.
     581         [ +  + ]:          6 :     for (const unsigned * p = whitespace; *p; ++p) {
     582 [ -  + ][ #  # ]:          5 :         TEST(!Xapian::Unicode::is_wordchar(*p));
     583 [ -  + ][ #  # ]:          5 :         TEST(!Xapian::Unicode::is_currency(*p));
     584 [ -  + ][ #  # ]:          5 :         TEST(Xapian::Unicode::is_whitespace(*p));
     585                 :            :     }
     586                 :            :     // Everything else: all three predicates must be false.
     587         [ +  + ]:         13 :     for (const unsigned * p = other; *p; ++p) {
     588 [ -  + ][ #  # ]:         12 :         TEST(!Xapian::Unicode::is_wordchar(*p));
     589 [ -  + ][ #  # ]:         12 :         TEST(!Xapian::Unicode::is_currency(*p));
     590 [ -  + ][ #  # ]:         12 :         TEST(!Xapian::Unicode::is_whitespace(*p));
     591                 :            :     }
     592                 :            : 
     593                 :          1 :     return true;
     594                 :            : }

Generated by: LCOV version 1.11