LCOV - code coverage report
Current view: top level - tests - stemtest.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core 954b5873a738 Lines: 89 102 87.3 %
Date: 2019-06-30 05:20:33 Functions: 6 6 100.0 %
Branches: 142 356 39.9 %

           Branch data     Line data    Source code
       1                 :            : /** @file stemtest.cc
       2                 :            :  * @brief Test stemming algorithms
       3                 :            :  */
       4                 :            : /* Copyright 1999,2000,2001 BrightStation PLC
       5                 :            :  * Copyright 2002 Ananova Ltd
       6                 :            :  * Copyright 2002,2003,2004,2007,2008,2009,2012,2015 Olly Betts
       7                 :            :  *
       8                 :            :  * This program is free software; you can redistribute it and/or
       9                 :            :  * modify it under the terms of the GNU General Public License as
      10                 :            :  * published by the Free Software Foundation; either version 2 of the
      11                 :            :  * License, or (at your option) any later version.
      12                 :            :  *
      13                 :            :  * This program is distributed in the hope that it will be useful,
      14                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16                 :            :  * GNU General Public License for more details.
      17                 :            :  *
      18                 :            :  * You should have received a copy of the GNU General Public License
      19                 :            :  * along with this program; if not, write to the Free Software
      20                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      21                 :            :  * USA
      22                 :            :  */
      23                 :            : 
      24                 :            : #include <config.h>
      25                 :            : 
      26                 :            : #include <cstdlib>
      27                 :            : 
      28                 :            : #include <string>
      29                 :            : #include <fstream>
      30                 :            : #include <iostream>
      31                 :            : 
      32                 :            : #include <xapian.h>
      33                 :            : #include "parseint.h"
      34                 :            : #include "testsuite.h"
      35                 :            : 
      36                 :            : using namespace std;
      37                 :            : 
      38                 :            : static const int JUNKSIZE = 2 * 1048576;
      39                 :            : 
      40                 :          1 : static string language;
      41                 :            : 
      42                 :          1 : static Xapian::Stem stemmer;
      43                 :            : 
      44                 :          1 : static string srcdir;
      45                 :            : 
      46                 :            : static int seed;
      47                 :            : 
      48                 :            : // run stemmers on random text
      49                 :            : static bool
      50                 :         29 : test_stemrandom()
      51                 :            : {
      52                 :            :     static const char wordchars[] =
      53                 :            :         "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
      54                 :            : 
      55 [ +  - ][ +  - ]:         29 :     tout << "Stemming random text... (seed " << seed << ")" << endl;
         [ +  - ][ +  - ]
      56                 :         29 :     srand(seed);
      57                 :            : 
      58         [ +  - ]:         29 :     string word;
      59                 :         29 :     int stemmed_size = 0;
      60         [ +  + ]:   60817437 :     for (int c = JUNKSIZE; c; --c) {
      61                 :   60817408 :         char ch = wordchars[(rand() >> 8) % sizeof wordchars];
      62         [ +  + ]:   60817408 :         if (ch) {
      63         [ +  - ]:   58948938 :             word += ch;
      64                 :   58948938 :             continue;
      65                 :            :         }
      66         [ +  - ]:    1868470 :         stemmed_size += stemmer(word).length();
      67         [ +  - ]:    1868470 :         word.resize(0);
      68                 :            :     }
      69         [ +  - ]:         29 :     stemmed_size += stemmer(word).length();
      70 [ +  - ][ +  - ]:         29 :     tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
         [ +  - ][ +  - ]
      71         [ +  - ]:         29 :          << endl;
      72                 :            : 
      73         [ -  + ]:         29 :     if (stemmed_size > JUNKSIZE * 101 / 100) {
      74 [ #  # ][ #  # ]:          0 :         FAIL_TEST("Stemmed data is significantly bigger than input: "
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
      75                 :            :                   << stemmed_size << " vs. " << JUNKSIZE);
      76                 :            :     }
      77         [ -  + ]:         29 :     if (stemmed_size < JUNKSIZE / 2) {
      78 [ #  # ][ #  # ]:          0 :         FAIL_TEST("Stemmed data is significantly smaller than input: "
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
      79                 :            :                   << stemmed_size << " vs. " << JUNKSIZE);
      80                 :            :     }
      81                 :         29 :     return true;
      82                 :            : }
      83                 :            : 
      84                 :            : // run stemmers on random junk
      85                 :            : static bool
      86                 :         29 : test_stemjunk()
      87                 :            : {
      88 [ +  - ][ +  - ]:         29 :     tout << "Stemming random junk... (seed " << seed << ")" << endl;
         [ +  - ][ +  - ]
      89                 :         29 :     srand(seed);
      90                 :            : 
      91         [ +  - ]:         29 :     string word;
      92                 :         29 :     int stemmed_size = 0;
      93         [ +  + ]:   60817437 :     for (int c = JUNKSIZE; c; --c) {
      94                 :   60817408 :         char ch = char(rand() >> 8);
      95         [ +  + ]:   60817408 :         if (ch) {
      96         [ +  - ]:   60580188 :             word += ch;
      97                 :   60580188 :             continue;
      98                 :            :         }
      99         [ +  - ]:     237220 :         stemmed_size += stemmer(word).length();
     100         [ +  - ]:     237220 :         word.resize(0);
     101                 :            :     }
     102         [ +  - ]:         29 :     stemmed_size += stemmer(word).length();
     103 [ +  - ][ +  - ]:         29 :     tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
         [ +  - ][ +  - ]
     104         [ +  - ]:         29 :          << endl;
     105                 :            : 
     106         [ -  + ]:         29 :     if (stemmed_size > JUNKSIZE * 101 / 100) {
     107 [ #  # ][ #  # ]:          0 :         FAIL_TEST("Stemmed data is significantly bigger than input ("
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     108                 :            :                   << stemmed_size << " vs. " << JUNKSIZE);
     109                 :            :     }
     110         [ -  + ]:         29 :     if (stemmed_size < JUNKSIZE / 2) {
     111 [ #  # ][ #  # ]:          0 :         FAIL_TEST("Stemmed data is significantly smaller than input ("
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     112                 :            :                   << stemmed_size << " vs. " << JUNKSIZE);
     113                 :            :     }
     114                 :         29 :     return true;
     115                 :            : }
     116                 :            : 
     117                 :            : static bool
     118                 :         29 : test_stemdict()
     119                 :            : {
     120         [ +  - ]:         29 :     string dir = srcdir + "/../../xapian-data/stemming/";
     121                 :            : 
     122 [ +  - ][ +  - ]:         58 :     ifstream voc((dir + language + "/voc.txt").c_str());
                 [ +  - ]
     123 [ +  - ][ +  + ]:         29 :     if (!voc.is_open()) {
     124 [ +  - ][ +  - ]:          4 :         SKIP_TEST(language << "/voc.txt not found");
         [ +  - ][ +  - ]
                 [ +  - ]
     125                 :            :     }
     126                 :            : 
     127 [ +  - ][ +  - ]:         50 :     ifstream st((dir + language + "/output.txt").c_str());
                 [ +  - ]
     128 [ +  - ][ -  + ]:         25 :     if (!st.is_open()) {
     129         [ #  # ]:          0 :         voc.close();
     130 [ #  # ][ #  # ]:          0 :         FAIL_TEST(language << "/output.txt not found");
         [ #  # ][ #  # ]
                 [ #  # ]
     131                 :            :     }
     132                 :            : 
     133 [ +  - ][ +  - ]:         25 :     tout << "Testing " << language << " with Snowball dictionary..." << endl;
         [ +  - ][ +  - ]
     134                 :            : 
     135                 :         25 :     int pass = 1;
     136                 :            :     while (true) {
     137 [ +  - ][ +  - ]:         52 :         string word, stem, expect;
         [ +  - ][ +  + ]
                 [ +  + ]
     138 [ +  - ][ +  + ]:    1579469 :         while (!voc.eof() && !st.eof()) {
         [ +  - ][ +  - ]
                 [ +  + ]
     139         [ +  - ]:    1579443 :             getline(voc, word);
     140         [ +  - ]:    1579443 :             getline(st, expect);
     141                 :            : 
     142 [ +  - ][ +  - ]:    1579443 :             stem = stemmer(word);
     143                 :            : 
     144 [ -  + ][ #  # ]:    1579443 :             TEST_EQUAL(stem, expect);
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
                 [ #  # ]
     145                 :            :         }
     146         [ +  - ]:         26 :         voc.close();
     147         [ +  - ]:         26 :         st.close();
     148                 :            : 
     149         [ +  + ]:         26 :         if (pass == 2) break;
     150                 :            : 
     151 [ +  - ][ +  - ]:         25 :         voc.open((dir + language + "/voc2.txt").c_str());
                 [ +  - ]
     152 [ +  - ][ +  + ]:         25 :         if (!voc.is_open()) break;
     153                 :            : 
     154 [ +  - ][ +  - ]:          1 :         st.open((dir + language + "/output2.txt").c_str());
                 [ +  - ]
     155 [ +  - ][ -  + ]:          1 :         if (!st.is_open()) {
     156         [ #  # ]:          0 :             voc.close();
     157 [ #  # ][ #  # ]:          0 :             FAIL_TEST(language << "/output2.txt not found");
         [ #  # ][ #  # ]
                 [ #  # ]
     158                 :            :         }
     159 [ +  - ][ +  - ]:          1 :         tout << "Testing " << language << " with supplemental dictionary..."
                 [ +  - ]
     160         [ +  - ]:          1 :              << endl;
     161         [ +  + ]:         26 :         ++pass;
     162                 :          1 :     }
     163                 :            : 
     164                 :         29 :     return true;
     165                 :            : }
     166                 :            : 
     167                 :            : // ##################################################################
     168                 :            : // # End of actual tests                                            #
     169                 :            : // ##################################################################
     170                 :            : 
     171                 :            : /// The lists of tests to perform
     172                 :            : static const test_desc tests[] = {
     173                 :            :     {"stemrandom",            test_stemrandom},
     174                 :            :     {"stemjunk",              test_stemjunk},
     175                 :            :     {"stemdict",              test_stemdict},
     176                 :            :     {0, 0}
     177                 :            : };
     178                 :            : 
     179                 :          1 : int main(int argc, char **argv)
     180                 :            : try {
     181         [ +  - ]:          1 :     string langs = Xapian::Stem::get_available_languages();
     182 [ +  - ][ +  - ]:          1 :     test_driver::add_command_line_option("languages", 'l', &langs);
     183                 :            : 
     184                 :          1 :     seed = 42;
     185         [ +  - ]:          2 :     string seed_str;
     186 [ +  - ][ +  - ]:          1 :     test_driver::add_command_line_option("seed", 's', &seed_str);
     187                 :            : 
     188         [ +  - ]:          1 :     test_driver::parse_command_line(argc, argv);
     189 [ +  - ][ +  - ]:          1 :     srcdir = test_driver::get_srcdir();
     190                 :          1 :     int result = 0;
     191                 :            : 
     192         [ -  + ]:          1 :     if (!seed_str.empty()) {
     193 [ #  # ][ #  # ]:          0 :         if (!parse_signed(seed_str.c_str(), seed)) {
     194                 :          0 :             throw "seed must be an integer";
     195                 :            :         }
     196                 :            :     }
     197 [ +  - ][ +  - ]:          1 :     cout << "The random seed is " << seed << endl;
                 [ +  - ]
     198 [ +  - ][ +  - ]:          1 :     cout << "Please report the seed when reporting a test failure." << endl;
     199                 :            : 
     200                 :          1 :     string::size_type b = 0;
     201         [ +  + ]:         30 :     while (b != langs.size()) {
     202                 :         29 :         string::size_type a = b;
     203 [ +  + ][ +  - ]:        246 :         while (b < langs.size() && langs[b] != ' ') ++b;
         [ +  + ][ +  + ]
     204         [ +  - ]:         29 :         language.assign(langs, a, b - a);
     205 [ +  + ][ +  - ]:         57 :         while (b < langs.size() && langs[b] == ' ') ++b;
         [ +  + ][ +  + ]
     206 [ +  - ][ +  - ]:         29 :         cout << "Running tests with " << language << " stemmer..." << endl;
         [ +  - ][ +  - ]
     207 [ +  - ][ +  - ]:         29 :         stemmer = Xapian::Stem(language);
     208         [ +  - ]:         29 :         result = max(result, test_driver::run(tests));
     209                 :            :     }
     210                 :          1 :     return result;
     211         [ #  # ]:          0 : } catch (const char * e) {
     212   [ #  #  #  # ]:          0 :     cout << e << endl;
     213                 :          0 :     return 1;
     214 [ +  - ][ +  - ]:          4 : }

Generated by: LCOV version 1.11