ICU 50.1.2  50.1.2
stringtriebuilder.h
Go to the documentation of this file.
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
13 */
14 
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17 
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20 
26 // Forward declaration.
27 struct UHashtable;
28 typedef struct UHashtable UHashtable;
29 
51 };
52 
54 
62 public:
63 #ifndef U_HIDE_INTERNAL_API
64 
65  static UBool hashNode(const void *node);
67  static UBool equalNodes(const void *left, const void *right);
68 #endif /* U_HIDE_INTERNAL_API */
69 
70 protected:
71  // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72  // or else the compiler will create a public default constructor.
76  virtual ~StringTrieBuilder();
77 
78 #ifndef U_HIDE_INTERNAL_API
79 
80  void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
82  void deleteCompactBuilder();
83 
85  void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
86 
88  int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
90  int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
91 #endif /* U_HIDE_INTERNAL_API */
92 
93  class Node;
94 
95 #ifndef U_HIDE_INTERNAL_API
96 
97  Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
99  Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
100  int32_t length, UErrorCode &errorCode);
101 #endif /* U_HIDE_INTERNAL_API */
102 
104  virtual int32_t getElementStringLength(int32_t i) const = 0;
106  virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
108  virtual int32_t getElementValue(int32_t i) const = 0;
109 
110  // Finds the first unit index after this one where
111  // the first and last element have different units again.
113  virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
114 
115  // Number of different units at unitIndex.
117  virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
119  virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
121  virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
122 
124  virtual UBool matchNodesCanHaveValues() const = 0;
125 
127  virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
129  virtual int32_t getMinLinearMatch() const = 0;
131  virtual int32_t getMaxLinearMatchLength() const = 0;
132 
133 #ifndef U_HIDE_INTERNAL_API
134  // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
136  static const int32_t kMaxBranchLinearSubNodeLength=5;
137 
138  // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139  // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
141  static const int32_t kMaxSplitBranchLevels=14;
142 
153  Node *registerNode(Node *newNode, UErrorCode &errorCode);
164  Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
165 
166  /*
167  * C++ note:
168  * registerNode() and registerFinalValue() take ownership of their input nodes,
169  * and only return owned nodes.
170  * If they see a failure UErrorCode, they will delete the input node.
171  * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
172  * If there is a failure, they return NULL.
173  *
174  * NULL Node pointers can be safely passed into other Nodes because
175  * they call the static Node::hashCode() which checks for a NULL pointer first.
176  *
177  * Therefore, as long as builder functions register a new node,
178  * they need to check for failures only before explicitly dereferencing
179  * a Node pointer, or before setting a new UErrorCode.
180  */
181 
182  // Hash set of nodes, maps from nodes to integer 1.
184  UHashtable *nodes;
185 
187  class Node : public UObject {
188  public:
189  Node(int32_t initialHash) : hash(initialHash), offset(0) {}
190  inline int32_t hashCode() const { return hash; }
191  // Handles node==NULL.
192  static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
193  // Base class operator==() compares the actual class types.
194  virtual UBool operator==(const Node &other) const;
195  inline UBool operator!=(const Node &other) const { return !operator==(other); }
223  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
224  // write() must set the offset to a positive value.
225  virtual void write(StringTrieBuilder &builder) = 0;
226  // See markRightEdgesFirst.
227  inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
228  StringTrieBuilder &builder) {
229  // Note: Edge numbers are negative, lastRight<=firstRight.
230  // If offset>0 then this node and its sub-nodes have been written already
231  // and we need not write them again.
232  // If this node is part of the unwritten right branch edge,
233  // then we wait until that is written.
234  if(offset<0 && (offset<lastRight || firstRight<offset)) {
235  write(builder);
236  }
237  }
238  inline int32_t getOffset() const { return offset; }
239  protected:
240  int32_t hash;
241  int32_t offset;
242  private:
243  // No ICU "poor man's RTTI" for this class nor its subclasses.
244  virtual UClassID getDynamicClassID() const;
245  };
246 
247  // This class should not be overridden because
248  // registerFinalValue() compares a stack-allocated FinalValueNode
249  // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
250  // with the input node, and the
251  // !Node::operator==(other) used inside FinalValueNode::operator==(other)
252  // will be false if the typeid's are different.
254  class FinalValueNode : public Node {
255  public:
256  FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
257  virtual UBool operator==(const Node &other) const;
258  virtual void write(StringTrieBuilder &builder);
259  protected:
260  int32_t value;
261  };
262 
266  class ValueNode : public Node {
267  public:
268  ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
269  virtual UBool operator==(const Node &other) const;
270  void setValue(int32_t v) {
271  hasValue=TRUE;
272  value=v;
273  hash=hash*37+v;
274  }
275  protected:
276  UBool hasValue;
277  int32_t value;
278  };
279 
284  public:
285  IntermediateValueNode(int32_t v, Node *nextNode)
286  : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
287  virtual UBool operator==(const Node &other) const;
288  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
289  virtual void write(StringTrieBuilder &builder);
290  protected:
291  Node *next;
292  };
293 
297  class LinearMatchNode : public ValueNode {
298  public:
299  LinearMatchNode(int32_t len, Node *nextNode)
300  : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
301  length(len), next(nextNode) {}
302  virtual UBool operator==(const Node &other) const;
303  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
304  protected:
305  int32_t length;
306  Node *next;
307  };
308 
312  class BranchNode : public Node {
313  public:
314  BranchNode(int32_t initialHash) : Node(initialHash) {}
315  protected:
316  int32_t firstEdgeNumber;
317  };
318 
322  class ListBranchNode : public BranchNode {
323  public:
324  ListBranchNode() : BranchNode(0x444444), length(0) {}
325  virtual UBool operator==(const Node &other) const;
326  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
327  virtual void write(StringTrieBuilder &builder);
328  // Adds a unit with a final value.
329  void add(int32_t c, int32_t value) {
330  units[length]=(UChar)c;
331  equal[length]=NULL;
332  values[length]=value;
333  ++length;
334  hash=(hash*37+c)*37+value;
335  }
336  // Adds a unit which leads to another match node.
337  void add(int32_t c, Node *node) {
338  units[length]=(UChar)c;
339  equal[length]=node;
340  values[length]=0;
341  ++length;
342  hash=(hash*37+c)*37+hashCode(node);
343  }
344  protected:
345  Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
346  int32_t length;
347  int32_t values[kMaxBranchLinearSubNodeLength];
348  UChar units[kMaxBranchLinearSubNodeLength];
349  };
350 
354  class SplitBranchNode : public BranchNode {
355  public:
356  SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
357  : BranchNode(((0x555555*37+middleUnit)*37+
358  hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
359  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
360  virtual UBool operator==(const Node &other) const;
361  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
362  virtual void write(StringTrieBuilder &builder);
363  protected:
364  UChar unit;
365  Node *lessThan;
366  Node *greaterOrEqual;
367  };
368 
369  // Branch head node, for writing the actual node lead unit.
371  class BranchHeadNode : public ValueNode {
372  public:
373  BranchHeadNode(int32_t len, Node *subNode)
374  : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
375  length(len), next(subNode) {}
376  virtual UBool operator==(const Node &other) const;
377  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
378  virtual void write(StringTrieBuilder &builder);
379  protected:
380  int32_t length;
381  Node *next; // A branch sub-node.
382  };
383 #endif /* U_HIDE_INTERNAL_API */
384 
386  virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
387  Node *nextNode) const = 0;
388 
390  virtual int32_t write(int32_t unit) = 0;
392  virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
394  virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
396  virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
398  virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
399 
400 private:
401  // No ICU "poor man's RTTI" for this class nor its subclasses.
402  virtual UClassID getDynamicClassID() const;
403 };
404 
406 
407 #endif // __STRINGTRIEBUILDER_H__
Builds a trie more slowly, attempting to generate a shorter but equivalent serialization.
Base class for string trie builder classes.
virtual UClassID getDynamicClassID() const =0
ICU4C &quot;poor man&#39;s RTTI&quot;, returns a UClassID for the actual ICU class.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler&#39;s RTTI.
Definition: uobject.h:96
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
Builds a trie quickly.
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:204
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:278
UStringTrieBuildOption
Build options for BytesTrieBuilder and CharsTrieBuilder.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:208
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:357
UObject is the common ICU &quot;boilerplate&quot; class.
Definition: uobject.h:229
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200