ICU 50.1.2  50.1.2
uniset.h
Go to the documentation of this file.
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2011, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 ***************************************************************************
9 */
10 
11 #ifndef UNICODESET_H
12 #define UNICODESET_H
13 
14 #include "unicode/unifilt.h"
15 #include "unicode/unistr.h"
16 #include "unicode/uset.h"
17 
24 
25 class BMPSet;
26 class ParsePosition;
27 class RBBIRuleScanner;
28 class SymbolTable;
29 class UnicodeSetStringSpan;
30 class UVector;
31 class RuleCharacterIterator;
32 
274 
275  int32_t len; // length of list used; 0 <= len <= capacity
276  int32_t capacity; // capacity of list
277  UChar32* list; // MUST be terminated with HIGH
278  BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
279  UChar32* buffer; // internal buffer, may be NULL
280  int32_t bufferCapacity; // capacity of buffer
281  int32_t patLen; // NOTE(review): presumably the length of pat -- confirm against setPattern()/releasePattern()
282 
292  UChar *pat; // NOTE(review): presumably the stored pattern text managed by setPattern()/releasePattern() -- confirm
293  UVector* strings; // maintained in sorted order
294  UnicodeSetStringSpan *stringSpan; // non-NULL marks the set as frozen (isFrozen() tests bmpSet and stringSpan)
295 
296 private:
297  enum { // constants
298  kIsBogus = 1 // This set is bogus (i.e. not valid)
299  };
300  uint8_t fFlags; // Bit flag (see constants above)
301 public:
311  inline UBool isBogus(void) const;
312 
329  void setToBogus();
330 
331 public:
332 
333  enum {
338  MIN_VALUE = 0,
339 
344  MAX_VALUE = 0x10ffff
345  };
346 
347  //----------------------------------------------------------------
348  // Constructors &c
349  //----------------------------------------------------------------
350 
351 public:
352 
357  UnicodeSet();
358 
367  UnicodeSet(UChar32 start, UChar32 end);
368 
377  UnicodeSet(const UnicodeString& pattern,
378  UErrorCode& status);
379 
380 #ifndef U_HIDE_INTERNAL_API
381 
393  UnicodeSet(const UnicodeString& pattern,
394  uint32_t options,
395  const SymbolTable* symbols,
396  UErrorCode& status);
397 #endif /* U_HIDE_INTERNAL_API */
398 
412  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
413  uint32_t options,
414  const SymbolTable* symbols,
415  UErrorCode& status);
416 
421  UnicodeSet(const UnicodeSet& o);
422 
427  virtual ~UnicodeSet();
428 
434  UnicodeSet& operator=(const UnicodeSet& o);
435 
447  virtual UBool operator==(const UnicodeSet& o) const;
448 
454  UBool operator!=(const UnicodeSet& o) const;
455 
465  virtual UnicodeFunctor* clone() const;
466 
474  virtual int32_t hashCode(void) const;
475 
484  inline static UnicodeSet *fromUSet(USet *uset);
485 
494  inline static const UnicodeSet *fromUSet(const USet *uset);
495 
503  inline USet *toUSet();
504 
505 
513  inline const USet * toUSet() const;
514 
515 
516  //----------------------------------------------------------------
517  // Freezable API
518  //----------------------------------------------------------------
519 
528  inline UBool isFrozen() const;
529 
543  UnicodeFunctor *freeze();
544 
553  UnicodeFunctor *cloneAsThawed() const;
554 
555  //----------------------------------------------------------------
556  // Public API
557  //----------------------------------------------------------------
558 
569  UnicodeSet& set(UChar32 start, UChar32 end);
570 
576  static UBool resemblesPattern(const UnicodeString& pattern,
577  int32_t pos);
578 
591  UnicodeSet& applyPattern(const UnicodeString& pattern,
592  UErrorCode& status);
593 
594 #ifndef U_HIDE_INTERNAL_API
595 
611  UnicodeSet& applyPattern(const UnicodeString& pattern,
612  uint32_t options,
613  const SymbolTable* symbols,
614  UErrorCode& status);
615 #endif /* U_HIDE_INTERNAL_API */
616 
648  UnicodeSet& applyPattern(const UnicodeString& pattern,
649  ParsePosition& pos,
650  uint32_t options,
651  const SymbolTable* symbols,
652  UErrorCode& status);
653 
667  virtual UnicodeString& toPattern(UnicodeString& result,
668  UBool escapeUnprintable = FALSE) const;
669 
692  UnicodeSet& applyIntPropertyValue(UProperty prop,
693  int32_t value,
694  UErrorCode& ec);
695 
725  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
726  const UnicodeString& value,
727  UErrorCode& ec);
728 
737  virtual int32_t size(void) const;
738 
745  virtual UBool isEmpty(void) const;
746 
754  virtual UBool contains(UChar32 c) const;
755 
764  virtual UBool contains(UChar32 start, UChar32 end) const;
765 
773  UBool contains(const UnicodeString& s) const;
774 
782  virtual UBool containsAll(const UnicodeSet& c) const;
783 
791  UBool containsAll(const UnicodeString& s) const;
792 
801  UBool containsNone(UChar32 start, UChar32 end) const;
802 
810  UBool containsNone(const UnicodeSet& c) const;
811 
819  UBool containsNone(const UnicodeString& s) const;
820 
829  inline UBool containsSome(UChar32 start, UChar32 end) const;
830 
838  inline UBool containsSome(const UnicodeSet& s) const;
839 
847  inline UBool containsSome(const UnicodeString& s) const;
848 
867  int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
868 
881  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
882 
900  int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
901 
915  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
916 
935  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
936 
954  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
955 
960  virtual UMatchDegree matches(const Replaceable& text,
961  int32_t& offset,
962  int32_t limit,
963  UBool incremental);
964 
965 private:
988  static int32_t matchRest(const Replaceable& text,
989  int32_t start, int32_t limit,
990  const UnicodeString& s);
991 
1001  int32_t findCodePoint(UChar32 c) const;
1002 
1003 public:
1004 
1012  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1013 
1022  int32_t indexOf(UChar32 c) const;
1023 
1033  UChar32 charAt(int32_t index) const;
1034 
1049  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1050 
1058  UnicodeSet& add(UChar32 c);
1059 
1071  UnicodeSet& add(const UnicodeString& s);
1072 
1073  private:
1079  static int32_t getSingleCP(const UnicodeString& s);
1080 
1081  void _add(const UnicodeString& s);
1082 
1083  public:
1092  UnicodeSet& addAll(const UnicodeString& s);
1093 
1102  UnicodeSet& retainAll(const UnicodeString& s);
1103 
1112  UnicodeSet& complementAll(const UnicodeString& s);
1113 
1122  UnicodeSet& removeAll(const UnicodeString& s);
1123 
1132  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1133 
1134 
1142  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1143 
1157  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1158 
1159 
1165  UnicodeSet& retain(UChar32 c);
1166 
1180  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1181 
1189  UnicodeSet& remove(UChar32 c);
1190 
1200  UnicodeSet& remove(const UnicodeString& s);
1201 
1209  virtual UnicodeSet& complement(void);
1210 
1225  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1226 
1234  UnicodeSet& complement(UChar32 c);
1235 
1246  UnicodeSet& complement(const UnicodeString& s);
1247 
1260  virtual UnicodeSet& addAll(const UnicodeSet& c);
1261 
1273  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1274 
1286  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1287 
1298  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1299 
1306  virtual UnicodeSet& clear(void);
1307 
1333  UnicodeSet& closeOver(int32_t attribute);
1334 
1341  virtual UnicodeSet &removeAllStrings();
1342 
1350  virtual int32_t getRangeCount(void) const;
1351 
1359  virtual UChar32 getRangeStart(int32_t index) const;
1360 
1368  virtual UChar32 getRangeEnd(int32_t index) const;
1369 
1418  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1419 
1426  virtual UnicodeSet& compact();
1427 
1439  static UClassID U_EXPORT2 getStaticClassID(void);
1440 
1449  virtual UClassID getDynamicClassID(void) const;
1450 
1451 private:
1452 
1453  // Private API for the USet API
1454 
1455  friend class USetAccess;
1456 
1457  int32_t getStringCount() const;
1458 
1459  const UnicodeString* getString(int32_t index) const;
1460 
1461  //----------------------------------------------------------------
1462  // RuleBasedTransliterator support
1463  //----------------------------------------------------------------
1464 
1465 private:
1466 
1472  virtual UBool matchesIndexValue(uint8_t v) const;
1473 
1474 private:
1475  friend class RBBIRuleScanner;
1476 
1477  //----------------------------------------------------------------
1478  // Implementation: Clone as thawed (see ICU4J Freezable)
1479  //----------------------------------------------------------------
1480 
1481  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1482 
1483  //----------------------------------------------------------------
1484  // Implementation: Pattern parsing
1485  //----------------------------------------------------------------
1486 
1487  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1488  ParsePosition& pos,
1489  const SymbolTable* symbols,
1490  UErrorCode& status);
1491 
1492  void applyPattern(RuleCharacterIterator& chars,
1493  const SymbolTable* symbols,
1494  UnicodeString& rebuiltPat,
1495  uint32_t options,
1496  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1497  UErrorCode& ec);
1498 
1499  //----------------------------------------------------------------
1500  // Implementation: Utility methods
1501  //----------------------------------------------------------------
1502 
1503  void ensureCapacity(int32_t newLen, UErrorCode& ec);
1504 
1505  void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
1506 
1507  void swapBuffers(void);
1508 
1509  UBool allocateStrings(UErrorCode &status);
1510 
1511  UnicodeString& _toPattern(UnicodeString& result,
1512  UBool escapeUnprintable) const;
1513 
1514  UnicodeString& _generatePattern(UnicodeString& result,
1515  UBool escapeUnprintable) const;
1516 
1517  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1518 
1519  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1520 
1521  //----------------------------------------------------------------
1522  // Implementation: Fundamental operators
1523  //----------------------------------------------------------------
1524 
1525  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1526 
1527  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1528 
1529  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1530 
1536  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1537  int32_t pos);
1538 
1539  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1540  int32_t iterOpts);
1541 
1581  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1582  ParsePosition& ppos,
1583  UErrorCode &ec);
1584 
1585  void applyPropertyPattern(RuleCharacterIterator& chars,
1586  UnicodeString& rebuiltPat,
1587  UErrorCode& ec);
1588 
1589  static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1590 
1595  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1596 
1606  void applyFilter(Filter filter,
1607  void* context,
1608  int32_t src,
1609  UErrorCode &status);
1610 
1614  void setPattern(const UnicodeString& newPat);
1618  void releasePattern();
1619 
1620  friend class UnicodeSetIterator;
1621 };
1622 
1623 
1624 
// Inequality operator: returns the logical negation of operator==(o).
// operator== is declared virtual in the class, so the comparison that is
// negated here is dispatched dynamically.
1625 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1626  return !operator==(o);
1627 }
1628 
// Reports whether this set is frozen.
// Returns a non-zero UBool when bmpSet or stringSpan (or both) is non-NULL,
// and zero when both are NULL.
1629 inline UBool UnicodeSet::isFrozen() const {
1630  return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1631 }
1632 
// Range overload: the exact complement of containsNone(start, end).
// Returns true precisely when containsNone(start, end) returns false.
1633 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1634  return !containsNone(start, end);
1635 }
1636 
// Set overload: the exact complement of containsNone(const UnicodeSet&).
// Returns true precisely when containsNone(s) returns false.
1637 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1638  return !containsNone(s);
1639 }
1640 
// String overload: the exact complement of containsNone(const UnicodeString&).
// Returns true precisely when containsNone(s) returns false.
1641 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1642  return !containsNone(s);
1643 }
1644 
// Reports whether the kIsBogus bit is set in fFlags.
// kIsBogus is 1, so the masked value returned is either 0 or 1.
1645 inline UBool UnicodeSet::isBogus() const {
1646  return (UBool)(fFlags & kIsBogus);
1647 }
1648 
// Reinterprets a C-API USet pointer as a UnicodeSet pointer.
// This is a pure pointer cast: no object is created or copied, and the
// argument is not validated here.
1649 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1650  return reinterpret_cast<UnicodeSet *>(uset);
1651 }
1652 
// Const overload: reinterprets a const USet pointer as a const UnicodeSet
// pointer. Pure pointer cast; constness is preserved and nothing is validated.
1653 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1654  return reinterpret_cast<const UnicodeSet *>(uset);
1655 }
1656 
// Returns this object viewed as a C-API USet pointer.
// Pure pointer cast of `this`; no copy is made, so the result aliases this set.
1657 inline USet *UnicodeSet::toUSet() {
1658  return reinterpret_cast<USet *>(this);
1659 }
1660 
// Const overload: returns this object viewed as a const USet pointer.
// Pure pointer cast of `this`; no copy is made, so the result aliases this set.
1661 inline const USet *UnicodeSet::toUSet() const {
1662  return reinterpret_cast<const USet *>(this);
1663 }
1664 
// UnicodeString convenience overload of span().
//
// s             - string to scan
// start         - index in s at which scanning begins; values outside
//                 [0, s.length()] are clamped into that range
// spanCondition - forwarded unchanged to the UChar* overload
//
// Delegates to span(const UChar*, int32_t, USetSpanCondition) on the part of
// the buffer that begins at the clamped start, then adds start to the result,
// so the value returned is offset by the starting index.
// NOTE(review): presumably the UChar* overload returns a length relative to
// the pointer it is given, making the sum an index into s -- confirm.
1665 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1666  int32_t sLength=s.length();
      // Clamp start into [0, sLength] so the pointer arithmetic below stays
      // within the string's buffer.
1667  if(start<0) {
1668  start=0;
1669  } else if(start>sLength) {
1670  start=sLength;
1671  }
      // Scan only the remaining sLength-start units, then add start back.
1672  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1673 }
1674 
// UnicodeString convenience overload of spanBack().
//
// s             - string to scan
// limit         - length of the prefix of s to consider; values outside
//                 [0, s.length()] are clamped into that range
// spanCondition - forwarded unchanged to the UChar* overload
//
// Delegates to spanBack(const UChar*, int32_t, USetSpanCondition) with the
// start of the buffer and the clamped limit as the length, and returns that
// result unchanged.
1675 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1676  int32_t sLength=s.length();
      // Clamp limit into [0, sLength] so the delegate never sees a length
      // that is negative or larger than the string.
1677  if(limit<0) {
1678  limit=0;
1679  } else if(limit>sLength) {
1680  limit=sLength;
1681  }
1682  return spanBack(s.getBuffer(), limit, spanCondition);
1683 }
1684 
1686 
1687 #endif
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:30
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition: usetiter.h:61
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:54
virtual UClassID getDynamicClassID(void) const =0
Returns a unique class ID polymorphically.
virtual UnicodeFunctor * clone() const =0
Return a copy of this object.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:71
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:59
UChar * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:298
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
UnicodeFunctor is an abstract base class for objects that perform match and/or replace operations on ...
Definition: unifunct.h:33
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher API.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:273
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:150
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:278
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
struct USet USet
Definition: ucnv.h:67
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:161
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:47
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const =0
Returns a string representation of this matcher.
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:208
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:357
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:246
C++ API: Unicode Filter.
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3610
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200