ICU 50.1.2  50.1.2
bytestrie.h
Go to the documentation of this file.
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: bytestrie.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010sep25
12 * created by: Markus W. Scherer
13 */
14 
15 #ifndef __BYTESTRIE_H__
16 #define __BYTESTRIE_H__
17 
23 #include "unicode/utypes.h"
24 #include "unicode/stringpiece.h"
25 #include "unicode/uobject.h"
26 #include "unicode/ustringtrie.h"
27 
29 
30 class ByteSink;
31 class BytesTrieBuilder;
32 class CharString;
33 class UVector32;
34 
48 class U_COMMON_API BytesTrie : public UMemory {
49 public:
64  BytesTrie(const void *trieBytes)
65  : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
66  pos_(bytes_), remainingMatchLength_(-1) {}
67 
72  ~BytesTrie();
73 
80  BytesTrie(const BytesTrie &other)
81  : ownedArray_(NULL), bytes_(other.bytes_),
82  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
83 
90  pos_=bytes_;
91  remainingMatchLength_=-1;
92  return *this;
93  }
94 
100  class State : public UMemory {
101  public:
106  State() { bytes=NULL; }
107  private:
108  friend class BytesTrie;
109 
110  const uint8_t *bytes;
111  const uint8_t *pos;
112  int32_t remainingMatchLength;
113  };
114 
122  const BytesTrie &saveState(State &state) const {
123  state.bytes=bytes_;
124  state.pos=pos_;
125  state.remainingMatchLength=remainingMatchLength_;
126  return *this;
127  }
128 
139  BytesTrie &resetToState(const State &state) {
140  if(bytes_==state.bytes && bytes_!=NULL) {
141  pos_=state.pos;
142  remainingMatchLength_=state.remainingMatchLength;
143  }
144  return *this;
145  }
146 
153  UStringTrieResult current() const;
154 
163  inline UStringTrieResult first(int32_t inByte) {
164  remainingMatchLength_=-1;
165  if(inByte<0) {
166  inByte+=0x100;
167  }
168  return nextImpl(bytes_, inByte);
169  }
170 
178  UStringTrieResult next(int32_t inByte);
179 
195  UStringTrieResult next(const char *s, int32_t length);
196 
206  inline int32_t getValue() const {
207  const uint8_t *pos=pos_;
208  int32_t leadByte=*pos++;
209  // U_ASSERT(leadByte>=kMinValueLead);
210  return readValue(pos, leadByte>>1);
211  }
212 
222  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
223  const uint8_t *pos=pos_;
224  // Skip the rest of a pending linear-match node.
225  return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
226  }
227 
236  int32_t getNextBytes(ByteSink &out) const;
237 
242  class U_COMMON_API Iterator : public UMemory {
243  public:
255  Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
256 
268  Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
269 
274  ~Iterator();
275 
281  Iterator &reset();
282 
287  UBool hasNext() const;
288 
303  UBool next(UErrorCode &errorCode);
304 
309  const StringPiece &getString() const { return sp_; }
314  int32_t getValue() const { return value_; }
315 
316  private:
317  UBool truncateAndStop();
318 
319  const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
320 
321  const uint8_t *bytes_;
322  const uint8_t *pos_;
323  const uint8_t *initialPos_;
324  int32_t remainingMatchLength_;
325  int32_t initialRemainingMatchLength_;
326 
327  CharString *str_;
328  StringPiece sp_;
329  int32_t maxLength_;
330  int32_t value_;
331 
332  // The stack stores pairs of integers for backtracking to another
333  // outbound edge of a branch node.
334  // The first integer is an offset from bytes_.
335  // The second integer has the str_->length() from before the node in bits 15..0,
336  // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
337  // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
338  // but the code looks more confusing that way.)
339  UVector32 *stack_;
340  };
341 
342 private:
343  friend class BytesTrieBuilder;
344 
351  BytesTrie(void *adoptBytes, const void *trieBytes)
352  : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
353  bytes_(static_cast<const uint8_t *>(trieBytes)),
354  pos_(bytes_), remainingMatchLength_(-1) {}
355 
356  // No assignment operator.
357  BytesTrie &operator=(const BytesTrie &other);
358 
359  inline void stop() {
360  pos_=NULL;
361  }
362 
363  // Reads a compact 32-bit integer.
364  // pos is already after the leadByte, and the lead byte is already shifted right by 1.
365  static int32_t readValue(const uint8_t *pos, int32_t leadByte);
366  static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
367  // U_ASSERT(leadByte>=kMinValueLead);
368  if(leadByte>=(kMinTwoByteValueLead<<1)) {
369  if(leadByte<(kMinThreeByteValueLead<<1)) {
370  ++pos;
371  } else if(leadByte<(kFourByteValueLead<<1)) {
372  pos+=2;
373  } else {
374  pos+=3+((leadByte>>1)&1);
375  }
376  }
377  return pos;
378  }
379  static inline const uint8_t *skipValue(const uint8_t *pos) {
380  int32_t leadByte=*pos++;
381  return skipValue(pos, leadByte);
382  }
383 
384  // Reads a jump delta and jumps.
385  static const uint8_t *jumpByDelta(const uint8_t *pos);
386 
387  static inline const uint8_t *skipDelta(const uint8_t *pos) {
388  int32_t delta=*pos++;
389  if(delta>=kMinTwoByteDeltaLead) {
390  if(delta<kMinThreeByteDeltaLead) {
391  ++pos;
392  } else if(delta<kFourByteDeltaLead) {
393  pos+=2;
394  } else {
395  pos+=3+(delta&1);
396  }
397  }
398  return pos;
399  }
400 
401  static inline UStringTrieResult valueResult(int32_t node) {
402  return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
403  }
404 
405  // Handles a branch node for both next(byte) and next(string).
406  UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
407 
408  // Requires remainingLength_<0.
409  UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
410 
411  // Helper functions for hasUniqueValue().
412  // Recursively finds a unique value (or whether there is not a unique one)
413  // from a branch.
414  static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
415  UBool haveUniqueValue, int32_t &uniqueValue);
416  // Recursively finds a unique value (or whether there is not a unique one)
417  // starting from a position on a node lead byte.
418  static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
419 
420  // Helper functions for getNextBytes().
421  // getNextBytes() when pos is on a branch node.
422  static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
423  static void append(ByteSink &out, int c);
424 
425  // BytesTrie data structure
426  //
427  // The trie consists of a series of byte-serialized nodes for incremental
428  // string/byte sequence matching. The root node is at the beginning of the trie data.
429  //
430  // Types of nodes are distinguished by their node lead byte ranges.
431  // After each node, except a final-value node, another node follows to
432  // encode match values or continue matching further bytes.
433  //
434  // Node types:
435  // - Value node: Stores a 32-bit integer in a compact, variable-length format.
436  // The value is for the string/byte sequence so far.
437  // One node bit indicates whether the value is final or whether
438  // matching continues with the next node.
439  // - Linear-match node: Matches a number of bytes.
440  // - Branch node: Branches to other nodes according to the current input byte.
441  // The node byte is the length of the branch (number of bytes to select from)
442  // minus 1. It is followed by a sub-node:
443  // - If the length is at most kMaxBranchLinearSubNodeLength, then
444  // there are length-1 (key, value) pairs and then one more comparison byte.
445  // If one of the key bytes matches, then the value is either a final value for
446  // the string/byte sequence so far, or a "jump" delta to the next node.
447  // If the last byte matches, then matching continues with the next node.
448  // (Values have the same encoding as value nodes.)
449  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
450  // there is one byte and one "jump" delta.
451  // If the input byte is less than the sub-node byte, then "jump" by delta to
452  // the next sub-node which will have a length of length/2.
453  // (The delta has its own compact encoding.)
454  // Otherwise, skip the "jump" delta to the next sub-node
455  // which will have a length of length-length/2.
456 
457  // Node lead byte values.
458 
459  // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
460  // the length is one more than the next byte.
461 
462  // For a branch sub-node with at most this many entries, we drop down
463  // to a linear search.
464  static const int32_t kMaxBranchLinearSubNodeLength=5;
465 
466  // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
467  static const int32_t kMinLinearMatch=0x10;
468  static const int32_t kMaxLinearMatchLength=0x10;
469 
470  // 20..ff: Variable-length value node.
471  // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
472  // Then shift-right by 1 bit.
473  // The remaining lead byte value indicates the number of following bytes (0..4)
474  // and contains the value's top bits.
475  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
476  // It is a final value if bit 0 is set.
477  static const int32_t kValueIsFinal=1;
478 
479  // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
480  static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10
481  static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
482 
483  static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
484  static const int32_t kMaxTwoByteValue=0x1aff;
485 
486  static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
487  static const int32_t kFourByteValueLead=0x7e;
488 
489  // A little more than Unicode code points. (0x11ffff)
490  static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
491 
492  static const int32_t kFiveByteValueLead=0x7f;
493 
494  // Compact delta integers.
495  static const int32_t kMaxOneByteDelta=0xbf;
496  static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
497  static const int32_t kMinThreeByteDeltaLead=0xf0;
498  static const int32_t kFourByteDeltaLead=0xfe;
499  static const int32_t kFiveByteDeltaLead=0xff;
500 
501  static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
502  static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
503 
504  uint8_t *ownedArray_;
505 
506  // Fixed value referencing the BytesTrie bytes.
507  const uint8_t *bytes_;
508 
509  // Iterator variables.
510 
511  // Pointer to next trie byte to read. NULL if no more matches.
512  const uint8_t *pos_;
513  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
514  int32_t remainingMatchLength_;
515 };
516 
518 
519 #endif // __BYTESTRIE_H__
Builder class for BytesTrie.
const BytesTrie & saveState(State &state) const
Saves the state of this trie.
Definition: bytestrie.h:122
BytesTrie state object, for saving a trie&#39;s current state and resetting the trie back to this state l...
Definition: bytestrie.h:100
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:33
BytesTrie(const void *trieBytes)
Constructs a BytesTrie reader instance.
Definition: bytestrie.h:64
const StringPiece & getString() const
Definition: bytestrie.h:309
A ByteSink can be filled with bytes.
Definition: bytestream.h:48
int32_t getValue() const
Definition: bytestrie.h:314
int32_t getValue() const
Returns a matching byte sequence&#39;s value if called immediately after current()/first()/next() returne...
Definition: bytestrie.h:206
Light-weight, non-const reader class for a BytesTrie.
Definition: bytestrie.h:48
BytesTrie & reset()
Resets this trie to its initial state.
Definition: bytestrie.h:89
C++ API: StringPiece: Read-only byte string wrapper class.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
UStringTrieResult first(int32_t inByte)
Traverses the trie from the initial state for this input byte.
Definition: bytestrie.h:163
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all byte sequences reachable from the current state map to the same value...
Definition: bytestrie.h:222
BytesTrie(const BytesTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the byte array which wil...
Definition: bytestrie.h:80
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
State()
Constructs an empty State.
Definition: bytestrie.h:106
Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
Definition: bytestrie.h:242
C++ API: Common ICU base class UObject.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
C API: Helper definitions for dictionary trie APIs.
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:208
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:357
The input unit(s) continued a matching string and there is a value for the string so far...
Definition: ustringtrie.h:64
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:52
BytesTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: bytestrie.h:139
UMemory is the common ICU base class.
Definition: uobject.h:115
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200