LCOV - code coverage report
Current view: top level - common - bitstream.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core c2b6f1024d3a Lines: 78 78 100.0 %
Date: 2019-05-16 09:13:18 Functions: 8 8 100.0 %
Branches: 34 34 100.0 %

           Branch data     Line data    Source code
       1                 :            : /** @file bitstream.cc
       2                 :            :  * @brief Classes to encode/decode a bitstream.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2017,2018 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or
       7                 :            :  * modify it under the terms of the GNU General Public License as
       8                 :            :  * published by the Free Software Foundation; either version 2 of the
       9                 :            :  * License, or (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      19                 :            :  * USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "bitstream.h"
      25                 :            : 
      26                 :            : #include <xapian/types.h>
      27                 :            : 
      28                 :            : #include "omassert.h"
      29                 :            : #include "pack.h"
      30                 :            : 
      31                 :            : #include <cmath>
      32                 :            : #include <vector>
      33                 :            : 
      34                 :            : using namespace std;
      35                 :            : 
      36                 :            : // Highly optimised fls() implementation.
                                        //
                                        // Returns the 1-based position of the highest set bit in mask (so 1 gives
                                        // 1 and 0x80 gives 8), or 0 when mask is zero.
                                        //
                                        // NOTE(review): the table-driven fallback indexes flstab directly with the
                                        // reduced mask and only narrows from 64 bits, so T is presumably always an
                                        // unsigned integer type of at most 64 bits - confirm against callers.
      37                 :            : template<typename T>
      38                 :            : static inline int
      39                 :     882861 : highest_order_bit(T mask)
      40                 :            : {
      41                 :            : #ifdef HAVE_DO_CLZ
                                            // do_clz() is defined elsewhere; presumably it counts leading zero
                                            // bits and isn't usable for a zero argument, hence the explicit check
                                            // for zero here - TODO confirm.
      42         [ +  + ]:     882861 :     return mask ? sizeof(T) * 8 - do_clz(mask) : 0;
      43                 :            : #else
                                            // flstab[b] is the 1-based position of the highest set bit of the
                                            // byte value b (and 0 for b == 0).
      44                 :            :     static const unsigned char flstab[256] = {
      45                 :            :         0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
      46                 :            :         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
      47                 :            :         6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
      48                 :            :         6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
      49                 :            :         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      50                 :            :         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      51                 :            :         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      52                 :            :         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      53                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      54                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      55                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      56                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      57                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      58                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      59                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
      60                 :            :         8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
      61                 :            :     };
      62                 :            : 
                                            // Shift mask down in steps of 32, 16 and 8 bits until only its highest
                                            // non-zero byte is left, counting the discarded low bits in result.
                                            // The 32-bit step is only attempted for types wider than 4 bytes.
      63                 :            :     int result = 0;
      64                 :            :     if (sizeof(T) > 4) {
      65                 :            :         if (mask >= 0x100000000ul) {
      66                 :            :             mask >>= 32;
      67                 :            :             result += 32;
      68                 :            :         }
      69                 :            :     }
      70                 :            :     if (mask >= 0x10000u) {
      71                 :            :         mask >>= 16;
      72                 :            :         result += 16;
      73                 :            :     }
      74                 :            :     if (mask >= 0x100u) {
      75                 :            :         mask >>= 8;
      76                 :            :         result += 8;
      77                 :            :     }
                                            // mask is now below 0x100, so the table finishes the job.
      78                 :            :     return result + flstab[mask];
      79                 :            : #endif
      80                 :            : }
      81                 :            : 
      82                 :            : namespace Xapian {
      83                 :            : 
      84                 :            : /// Shift left that's safe for shifts wider than the type.
                                        ///
                                        /// @param x      Value to shift.
                                        /// @param shift  Number of bit positions to shift @a x left by.
                                        ///
                                        /// @return  x << shift, or 0 if @a shift is at least the width of T in
                                        ///          bits (a case where the built-in << is undefined behaviour).
      85                 :            : template<typename T, typename U>
      86                 :            : static constexpr inline
      87                 :     879964 : T safe_shl(T x, U shift)
      88                 :            : {
      89         [ +  + ]:     879964 :     return (shift >= sizeof(T) * 8 ? 0 : x << shift);
      90                 :            : }
      91                 :            : 
                                        /** Append @a value to the bitstream, given that it is one of @a outof
                                         *  possible values.
                                         *
                                         *  The bits are accumulated in acc (n_bits of which are pending) and each
                                         *  complete byte is appended to buf.
                                         *
                                         *  @param value  Value to encode; asserted to be less than @a outof.
                                         *  @param outof  Number of possible values.
                                         */
      92                 :            : void
      93                 :     837865 : BitWriter::encode(Xapian::termpos value, Xapian::termpos outof)
      94                 :            : {
      95                 :            :     Assert(value < outof);
                                            // bits is the width needed for the largest possible value, and spare
                                            // is how many of the 2^bits codes of that width would go unused.
      96                 :     837865 :     unsigned bits = highest_order_bit(outof - Xapian::termpos(1));
      97                 :     837865 :     const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
      98         [ +  + ]:     837865 :     if (spare) {
      99                 :            :         /* If we have spare values, we can use one fewer bit to encode some
     100                 :            :          * values.  We shorten the values in the middle of the range, as
     101                 :            :          * testing (on positional data) shows this works best.  "Managing
     102                 :            :          * Gigabytes" suggests reversing this for the lowest level and encoding
     103                 :            :          * the end values of the range shorter, which is contrary to our
     104                 :            :          * testing (MG is talking about posting lists, which probably have
     105                 :            :          * different characteristics).
     106                 :            :          *
     107                 :            :          * For example, if outof is 11, the codes emitted are:
     108                 :            :          *
     109                 :            :          * value        output
     110                 :            :          * 0            0000
     111                 :            :          * 1            0001
     112                 :            :          * 2            0010
     113                 :            :          * 3             011
     114                 :            :          * 4             100
     115                 :            :          * 5             101
     116                 :            :          * 6             110
     117                 :            :          * 7             111
     118                 :            :          * 8            1000
     119                 :            :          * 9            1001
     120                 :            :          * 10           1010
     121                 :            :          *
     122                 :            :          * Note the LSB comes first in the bitstream, so these codes need to be
     123                 :            :          * suffix-free to be decoded.
     124                 :            :          */
                                                // Values below mid_start are written unchanged using the full
                                                // width, so neither branch below applies to them.
     125                 :     660382 :         const Xapian::termpos mid_start = (outof - spare) / 2;
     126         [ +  + ]:     660382 :         if (value >= mid_start + spare) {
                                                    // Above the shortened range: renumber from zero and set the
                                                    // top bit to distinguish these from the values below mid_start.
     127                 :      94845 :             value = (value - (mid_start + spare)) |
     128                 :      94845 :                     (Xapian::termpos(1) << (bits - 1));
     129         [ +  + ]:     565537 :         } else if (value >= mid_start) {
                                                    // In the shortened range: same value, one bit fewer.
     130                 :     660382 :             --bits;
     131                 :            :         }
     132                 :            :     }
     133                 :            : 
     134         [ +  + ]:     837865 :     if (bits + n_bits > sizeof(acc) * 8) {
     135                 :            :         // We need to write more bits than there's empty room for in
     136                 :            :         // the accumulator.  So we arrange to shift out 8 bits, then
     137                 :            :         // adjust things so we're adding 8 fewer bits.
                                                //
                                                // n_bits is deliberately left alone here: the byte removed from
                                                // acc is balanced by the 8 low bits dropped from value, so what
                                                // remains of value still belongs at offset n_bits.
     138                 :            :         Assert(bits <= sizeof(acc) * 8);
     139                 :        100 :         acc |= (value << n_bits);
     140                 :        100 :         buf += char(acc);
     141                 :        100 :         acc >>= 8;
     142                 :        100 :         value >>= 8;
     143                 :        100 :         bits -= 8;
     144                 :            :     }
     145                 :     837865 :     acc |= (value << n_bits);
     146                 :     837865 :     n_bits += bits;
                                            // Flush every complete byte, leaving fewer than 8 bits pending.
     147         [ +  + ]:    1294525 :     while (n_bits >= 8) {
     148                 :     456660 :         buf += char(acc);
     149                 :     456660 :         acc >>= 8;
     150                 :     456660 :         n_bits -= 8;
     151                 :            :     }
     152                 :     837865 : }
     153                 :            : 
                                        /** Encode the entries of @a pos with indices strictly between @a j and
                                         *  @a k.
                                         *
                                         *  pos[j] and pos[k] are only used to bound the range each value is
                                         *  encoded in; this method never passes them to encode() itself.
                                         *
                                         *  @param pos  Positions to encode.  NOTE(review): presumably strictly
                                         *              increasing, as outof and the encoded offset are computed
                                         *              as unsigned differences - confirm against callers.
                                         *  @param j    Index of the lower bounding entry.
                                         *  @param k    Index of the upper bounding entry.
                                         */
     154                 :            : void
     155                 :     627363 : BitWriter::encode_interpolative(const Xapian::VecCOW<Xapian::termpos> &pos, int j, int k)
     156                 :            : {
     157                 :            :     // "Interpolative code" - for an algorithm description, see "Managing
     158                 :            :     // Gigabytes" - pages 126-127 in the second edition.  You can probably
     159                 :            :     // view those pages in Google Books.
     160         [ +  + ]:    1044224 :     while (j + 1 < k) {
     161                 :     416861 :         const Xapian::termpos mid = j + (k - j) / 2;
     162                 :            :         // Encode one out of (pos[k] - pos[j] + 1) values
     163                 :            :         // (less some at either end because we must be able to fit
     164                 :            :         // all the intervening pos in)
     165                 :     416861 :         const Xapian::termpos outof = pos[k] - pos[j] + j - k + 1;
     166                 :     416861 :         const Xapian::termpos lowest = pos[j] + mid - j;
     167                 :     416861 :         encode(pos[mid] - lowest, outof);
                                                // The left half [j, mid] is handled by recursion; the right half
                                                // [mid, k] by advancing j and going round the loop again.
     168                 :     416861 :         encode_interpolative(pos, j, mid);
     169                 :     416861 :         j = mid;
     170                 :            :     }
     171                 :     627363 : }
     172                 :            : 
                                        /** Read one value which is one of @a outof possible values.
                                         *
                                         *  The widths and ranges are computed exactly as in BitWriter::encode(),
                                         *  so this reverses an encode() call made with the same @a outof.
                                         *
                                         *  @param outof  Number of possible values; the result is asserted to be
                                         *                less than this.
                                         *  @param force  Only consulted by the assertion at the top; the cast to
                                         *                void presumably avoids an unused parameter warning when
                                         *                assertions are compiled out.
                                         *
                                         *  @return  The decoded value.
                                         */
     173                 :            : Xapian::termpos
     174                 :      42099 : BitReader::decode(Xapian::termpos outof, bool force)
     175                 :            : {
     176                 :            :     (void)force;
     177                 :            :     Assert(force == di_current.is_initialized());
     178                 :      42099 :     Xapian::termpos bits = highest_order_bit(outof - Xapian::termpos(1));
     179                 :      42099 :     const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
     180                 :      42099 :     const Xapian::termpos mid_start = (outof - spare) / 2;
     181                 :            :     Xapian::termpos pos;
     182         [ +  + ]:      42099 :     if (spare) {
                                                // Every code is at least (bits - 1) bits long, so read that much
                                                // first.  A result of mid_start or more is a complete shortened
                                                // code; anything lower needs one more bit to say whether it is
                                                // the value itself or an offset above the shortened range.
     183                 :      40234 :         pos = read_bits(bits - 1);
     184         [ +  + ]:      40234 :         if (pos < mid_start) {
     185         [ +  + ]:       7120 :             if (read_bits(1)) pos += mid_start + spare;
     186                 :            :         }
     187                 :            :     } else {
                                                // No spare codes, so every value uses the full width.
     188                 :       1865 :         pos = read_bits(bits);
     189                 :            :     }
     190                 :            :     Assert(pos < outof);
     191                 :      42099 :     return pos;
     192                 :            : }
     193                 :            : 
                                        /** Read the next @a count bits from the stream.
                                         *
                                         *  Bytes are fetched from p into acc as needed, each new byte going
                                         *  above the bits already pending, and bits are taken from the low end.
                                         *
                                         *  @param count  Number of bits to read.
                                         *
                                         *  @return  The bits read, in the low @a count bits of the result.
                                         */
     194                 :            : Xapian::termpos
     195                 :      49939 : BitReader::read_bits(int count)
     196                 :            : {
     197                 :            :     Xapian::termpos result;
     198         [ +  + ]:      49939 :     if (count > int(sizeof(acc) * 8 - 7)) {
     199                 :            :         // If we need more than 7 bits less than fit in acc do the read in two
     200                 :            :         // goes to ensure that we don't overflow acc.  This is a little more
     201                 :            :         // conservative than it needs to be, but such large values will
     202                 :            :         // inevitably be rare (because you can't fit very many of them into
     203                 :            :         // the full Xapian::termpos range).
                                                //
                                                // The first read takes half the width of acc and the second takes
                                                // whatever remains, which becomes the high part of the result.
     204                 :            :         Assert(count <= int(sizeof(acc) * 8));
     205                 :        360 :         const size_t half_the_bits = sizeof(acc) * 4;
     206                 :        360 :         result = read_bits(half_the_bits);
     207                 :        360 :         return result | (read_bits(count - half_the_bits) << half_the_bits);
     208                 :            :     }
                                            // Top up acc a byte at a time.  Starting from at most count - 1 pending
                                            // bits, this can leave as many as count + 7 bits in acc, which is
                                            // where the limit checked above comes from.
     209         [ +  + ]:      96420 :     while (n_bits < count) {
     210                 :            :         Assert(p < end);
     211                 :      46841 :         acc |= Xapian::termpos(static_cast<unsigned char>(*p++)) << n_bits;
     212                 :      46841 :         n_bits += 8;
     213                 :            :     }
                                            // Take the low count bits as the result and drop them from acc.
     214                 :      49579 :     result = acc & ((Xapian::termpos(1) << count) - Xapian::termpos(1));
     215                 :      49579 :     acc >>= count;
     216                 :      49579 :     n_bits -= count;
     217                 :      49579 :     return result;
     218                 :            : }
     219                 :            : 
                                        /** Set up decoding of an interpolatively coded run of positions.
                                         *
                                         *  This only records the bounds in di_current and reserves space in
                                         *  di_stack; it doesn't call read_bits() or decode().  The positions are
                                         *  then obtained from decode_interpolative_next(), which asserts that
                                         *  this set up has been done.
                                         *
                                         *  @param j      Index of the lower bounding entry.
                                         *  @param k      Index of the upper bounding entry.
                                         *  @param pos_j  Position of the entry at index @a j.
                                         *  @param pos_k  Position of the entry at index @a k.
                                         */
     220                 :            : void
     221                 :       2897 : BitReader::decode_interpolative(int j, int k,
     222                 :            :                                 Xapian::termpos pos_j, Xapian::termpos pos_k)
     223                 :            : {
     224                 :            :     Assert(!di_current.is_initialized());
                                            // NOTE(review): presumably the reserve is sized from the bit-length of
                                            // the position range because each level of di_stack roughly halves
                                            // that range - confirm.
     225                 :       2897 :     di_stack.reserve(highest_order_bit(pos_k - pos_j));
     226                 :       2897 :     di_current.set_j(j, pos_j);
     227                 :       2897 :     di_current.set_k(k, pos_k);
     228                 :       2897 : }
     229                 :            : 
                                        /** Return the next position from the run set up by decode_interpolative().
                                         *
                                         *  This visits the midpoints in the order in which
                                         *  BitWriter::encode_interpolative() handles them, but keeps the
                                         *  enclosing ranges on di_stack instead of recursing, so it can return
                                         *  after producing each position.
                                         *
                                         *  NOTE(review): di_current.is_next() is defined elsewhere; presumably it
                                         *  reports whether an undecoded entry remains between j and k - confirm.
                                         *
                                         *  @return  The next position.  Once di_stack is empty and is_next() is
                                         *           false, this is di_current.pos_k.
                                         */
     230                 :            : Xapian::termpos
     231                 :      37308 : BitReader::decode_interpolative_next()
     232                 :            : {
     233                 :            :     Assert(di_current.is_initialized());
     234 [ +  + ][ +  + ]:      72127 :     while (!di_stack.empty() || di_current.is_next()) {
                 [ +  + ]
     235         [ +  + ]:      69608 :         if (!di_current.is_next()) {
                                                    // Nothing left in the current range, so its upper bound is
                                                    // the next position to return.  Before returning, go back to
                                                    // the enclosing range and make its midpoint (the position
                                                    // being returned) the new lower bound, so that the following
                                                    // call carries on with the upper half.
     236                 :      34789 :             Xapian::termpos pos_ret = di_current.pos_k;
     237                 :      34789 :             di_current = di_stack.back();
     238                 :      34789 :             di_stack.pop_back();
     239                 :      34789 :             int mid = (di_current.j + di_current.k) / 2;
     240                 :      34789 :             di_current.set_j(mid, pos_ret);
     241                 :      34789 :             return pos_ret;
     242                 :            :         }
                                                // Remember the current range, decode its midpoint, and descend
                                                // into the lower half by making the midpoint the new upper bound.
                                                // The decoded value is an offset from the lowest position the
                                                // midpoint could have.
     243                 :      34819 :         di_stack.push_back(di_current);
     244                 :      34819 :         int mid = (di_current.j + di_current.k) / 2;
     245                 :      34819 :         Xapian::termpos pos_mid = decode(di_current.outof(), true) +
     246                 :      34819 :                                   (di_current.pos_j + mid - di_current.j);
     247                 :      34819 :         di_current.set_k(mid, pos_mid);
     248                 :            :     }
                                            // Only builds with assertions enabled mark the state as uninitialised
                                            // here; pos_k is still read after that, so uninit() presumably leaves
                                            // it intact - TODO confirm.
     249                 :            : #ifdef XAPIAN_ASSERTIONS
     250                 :            :     di_current.uninit();
     251                 :            : #endif
     252                 :       2519 :     return di_current.pos_k;
     253                 :            : }
     254                 :            : 
     255                 :            : }

Generated by: LCOV version 1.11