ICU 50.1.2  50.1.2
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
59 
60 struct Regex8BitSet;
61 class RegexCImpl;
62 class RegexMatcher;
63 class RegexPattern;
64 struct REStackFrame;
65 class RuleBasedBreakIterator;
66 class UnicodeSet;
67 class UVector;
68 class UVector32;
69 class UVector64;
70 
75 #ifdef REGEX_DEBUG // Debug builds: declare the real pattern-dump function.
76 U_INTERNAL void U_EXPORT2
77  RegexPatternDump(const RegexPattern *pat);
78 #else // REGEX_DEBUG not defined: RegexPatternDump(pat) becomes an empty macro, so calls expand to nothing.
79  #undef RegexPatternDump
80  #define RegexPatternDump(pat)
81 #endif // REGEX_DEBUG
82 
83 
84 
97 public:
98 
106  RegexPattern();
107 
114  RegexPattern(const RegexPattern &source);
115 
121  virtual ~RegexPattern();
122 
131  UBool operator==(const RegexPattern& that) const;
132 
141  inline UBool operator!=(const RegexPattern& that) const { return !this->operator==(that); } // Negation of operator==.
142 
148  RegexPattern &operator =(const RegexPattern &source);
149 
157  virtual RegexPattern *clone() const;
158 
159 
184  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
185  UParseError &pe,
186  UErrorCode &status);
187 
214  static RegexPattern * U_EXPORT2 compile( UText *regex,
215  UParseError &pe,
216  UErrorCode &status);
217 
242  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
243  uint32_t flags,
244  UParseError &pe,
245  UErrorCode &status);
246 
273  static RegexPattern * U_EXPORT2 compile( UText *regex,
274  uint32_t flags,
275  UParseError &pe,
276  UErrorCode &status);
277 
300  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
301  uint32_t flags,
302  UErrorCode &status);
303 
328  static RegexPattern * U_EXPORT2 compile( UText *regex,
329  uint32_t flags,
330  UErrorCode &status);
331 
337  virtual uint32_t flags() const;
338 
356  virtual RegexMatcher *matcher(const UnicodeString &input,
357  UErrorCode &status) const;
358 
359 private:
373  RegexMatcher *matcher(const UChar *input,
374  UErrorCode &status) const;
375 public:
376 
377 
389  virtual RegexMatcher *matcher(UErrorCode &status) const;
390 
391 
406  static UBool U_EXPORT2 matches(const UnicodeString &regex,
407  const UnicodeString &input,
408  UParseError &pe,
409  UErrorCode &status);
410 
425  static UBool U_EXPORT2 matches(UText *regex,
426  UText *input,
427  UParseError &pe,
428  UErrorCode &status);
429 
438  virtual UnicodeString pattern() const;
439 
440 
451  virtual UText *patternText(UErrorCode &status) const;
452 
453 
492  virtual int32_t split(const UnicodeString &input,
493  UnicodeString dest[],
494  int32_t destCapacity,
495  UErrorCode &status) const;
496 
497 
536  virtual int32_t split(UText *input,
537  UText *dest[],
538  int32_t destCapacity,
539  UErrorCode &status) const;
540 
541 
547  virtual UClassID getDynamicClassID() const;
548 
554  static UClassID U_EXPORT2 getStaticClassID();
555 
556 private:
557  //
558  // Implementation Data
559  //
560  UText *fPattern; // The original pattern string.
561  UnicodeString *fPatternString; // The original pattern UnicodeString, if relevant.
562  uint32_t fFlags; // The flags used when compiling the pattern.
563  //
564  UVector64 *fCompiledPat; // The compiled pattern p-code.
565  UnicodeString fLiteralText; // Any literal string data from the pattern,
566  // after un-escaping, for use during the match.
567 
568  UVector *fSets; // Any UnicodeSets referenced from the pattern.
569  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
570 
571 
572  UErrorCode fDeferredStatus; // status if some prior error has left this
573  // RegexPattern in an unusable state.
574 
575  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
576  // >= this value. For some patterns, this calculated
577  // value may be less than the true shortest
578  // possible match.
579 
580  int32_t fFrameSize; // Size of a state stack frame in the
581  // execution engine.
582 
583  int32_t fDataSize; // The size of the data needed by the pattern that
584  // does not go on the state stack, but has just
585  // a single copy per matcher.
586 
587  UVector32 *fGroupMap; // Map from capture group number to position of
588  // the group's variables in the matcher stack frame.
589 
590  int32_t fMaxCaptureDigits;
591 
592  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
593  // regex character classes, e.g. Word.
594 
595  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
596  // sets for predefined regex classes.
597 
598  int32_t fStartType; // Info on how a match must start.
599  int32_t fInitialStringIdx; //
600  int32_t fInitialStringLen;
601  UnicodeSet *fInitialChars;
602  UChar32 fInitialChar;
603  Regex8BitSet *fInitialChars8;
604  UBool fNeedsAltInput;
605 
606  friend class RegexCompile;
607  friend class RegexMatcher;
608  friend class RegexCImpl;
609 
610  //
611  // Implementation Methods
612  //
613  void init(); // Common initialization, for use by constructors.
614  void zap(); // Common cleanup
615 #ifdef REGEX_DEBUG
616  void dumpOp(int32_t index) const;
617  friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
618 #endif
619 
620 };
621 
622 
623 
634 public:
635 
650  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
651 
667  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
668 
690  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
691  uint32_t flags, UErrorCode &status);
692 
714  RegexMatcher(UText *regexp, UText *input,
715  uint32_t flags, UErrorCode &status);
716 
717 private:
731  RegexMatcher(const UnicodeString &regexp, const UChar *input,
732  uint32_t flags, UErrorCode &status);
733 public:
734 
735 
741  virtual ~RegexMatcher();
742 
743 
750  virtual UBool matches(UErrorCode &status);
751 
752 
763  virtual UBool matches(int64_t startIndex, UErrorCode &status);
764 
765 
779  virtual UBool lookingAt(UErrorCode &status);
780 
781 
795  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
796 
797 
810  virtual UBool find();
811 
812 
822  virtual UBool find(int64_t start, UErrorCode &status);
823 
824 
834  virtual UnicodeString group(UErrorCode &status) const;
835 
836 
849  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
850 
851 
857  virtual int32_t groupCount() const;
858 
859 
874  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
875 
891  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
892 
908  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
909 
910 
918  virtual int32_t start(UErrorCode &status) const;
919 
927  virtual int64_t start64(UErrorCode &status) const;
928 
929 
943  virtual int32_t start(int32_t group, UErrorCode &status) const;
944 
958  virtual int64_t start64(int32_t group, UErrorCode &status) const;
959 
960 
974  virtual int32_t end(UErrorCode &status) const;
975 
989  virtual int64_t end64(UErrorCode &status) const;
990 
991 
1009  virtual int32_t end(int32_t group, UErrorCode &status) const;
1010 
1028  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1029 
1030 
1039  virtual RegexMatcher &reset();
1040 
1041 
1057  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1058 
1059 
1077  virtual RegexMatcher &reset(const UnicodeString &input);
1078 
1079 
1093  virtual RegexMatcher &reset(UText *input);
1094 
1095 
1120  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1121 
1122 private:
1136  RegexMatcher &reset(const UChar *input);
1137 public:
1138 
1146  virtual const UnicodeString &input() const;
1147 
1156  virtual UText *inputText() const;
1157 
1168  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1169 
1170 
1189  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1190 
1202  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1203 
1212  virtual int32_t regionStart() const;
1213 
1222  virtual int64_t regionStart64() const;
1223 
1224 
1233  virtual int32_t regionEnd() const;
1234 
1243  virtual int64_t regionEnd64() const;
1244 
1253  virtual UBool hasTransparentBounds() const;
1254 
1273  virtual RegexMatcher &useTransparentBounds(UBool b);
1274 
1275 
1283  virtual UBool hasAnchoringBounds() const;
1284 
1285 
1298  virtual RegexMatcher &useAnchoringBounds(UBool b);
1299 
1300 
1313  virtual UBool hitEnd() const;
1314 
1324  virtual UBool requireEnd() const;
1325 
1326 
1332  virtual const RegexPattern &pattern() const;
1333 
1334 
1351  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1352 
1353 
1374  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1375 
1376 
1397  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1398 
1399 
1424  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1425 
1426 
1454  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1455  const UnicodeString &replacement, UErrorCode &status);
1456 
1457 
1485  virtual RegexMatcher &appendReplacement(UText *dest,
1486  UText *replacement, UErrorCode &status);
1487 
1488 
1499  virtual UnicodeString &appendTail(UnicodeString &dest);
1500 
1501 
1515  virtual UText *appendTail(UText *dest, UErrorCode &status);
1516 
1517 
1541  virtual int32_t split(const UnicodeString &input,
1542  UnicodeString dest[],
1543  int32_t destCapacity,
1544  UErrorCode &status);
1545 
1546 
1570  virtual int32_t split(UText *input,
1571  UText *dest[],
1572  int32_t destCapacity,
1573  UErrorCode &status);
1574 
1596  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1597 
1604  virtual int32_t getTimeLimit() const;
1605 
1627  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1628 
1636  virtual int32_t getStackLimit() const;
1637 
1638 
1652  virtual void setMatchCallback(URegexMatchCallback *callback,
1653  const void *context,
1654  UErrorCode &status);
1655 
1656 
1667  virtual void getMatchCallback(URegexMatchCallback *&callback,
1668  const void *&context,
1669  UErrorCode &status);
1670 
1671 
1685  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1686  const void *context,
1687  UErrorCode &status);
1688 
1689 
1700  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1701  const void *&context,
1702  UErrorCode &status);
1703 
1704 #ifndef U_HIDE_INTERNAL_API
1705 
1710  void setTrace(UBool state);
1711 #endif /* U_HIDE_INTERNAL_API */
1712 
1718  static UClassID U_EXPORT2 getStaticClassID();
1719 
1725  virtual UClassID getDynamicClassID() const;
1726 
1727 private:
1728  // Constructors and other object boilerplate are private.
1729  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1730  RegexMatcher(); // default constructor not implemented
1731  RegexMatcher(const RegexPattern *pat);
1732  RegexMatcher(const RegexMatcher &other);
1733  RegexMatcher &operator =(const RegexMatcher &rhs);
1734  void init(UErrorCode &status); // Common initialization
1735  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1736 
1737  friend class RegexPattern;
1738  friend class RegexCImpl;
1739 public:
1740 #ifndef U_HIDE_INTERNAL_API
1741 
1742  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1743 #endif /* U_HIDE_INTERNAL_API */
1744 private:
1745 
1746  //
1747  // MatchAt This is the internal interface to the match engine itself.
1748  // Match status comes back in matcher member variables.
1749  //
1750  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1751  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1752  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1753  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1754  REStackFrame *resetStack();
1755  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1756  void IncrementTime(UErrorCode &status);
1757  UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
1758 
1759  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1760 
1761  UBool findUsingChunk();
1762  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1763  UBool isChunkWordBoundary(int32_t pos);
1764 
1765  const RegexPattern *fPattern;
1766  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1767  // should delete it when through.
1768 
1769  const UnicodeString *fInput; // The string being matched. Only used for input()
1770  UText *fInputText; // The text being matched. Is never NULL.
1771  UText *fAltInputText; // A shallow copy of the text being matched.
1772  // Only created if the pattern contains backreferences.
1773  int64_t fInputLength; // Full length of the input text.
1774  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1775 
1776  int64_t fRegionStart; // Start of the input region, default = 0.
1777  int64_t fRegionLimit; // End of input region, default to input.length.
1778 
1779  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1780  int64_t fAnchorLimit; // See useAnchoringBounds
1781 
1782  int64_t fLookStart; // Region bounds for look-ahead/behind
1783  int64_t fLookLimit; // and other boundary tests. See
1784  // useTransparentBounds.
1785 
1786  int64_t fActiveStart; // Currently active bounds for matching.
1787  int64_t fActiveLimit; // Usually is the same as region, but
1788  // is changed to fLookStart/Limit when
1789  // entering look around regions.
1790 
1791  UBool fTransparentBounds; // True if using transparent bounds.
1792  UBool fAnchoringBounds; // True if using anchoring bounds.
1793 
1794  UBool fMatch; // True if the last attempted match was successful.
1795  int64_t fMatchStart; // Position of the start of the most recent match
1796  int64_t fMatchEnd; // First position after the end of the most recent match
1797  // Zero if no previous match, even when a region
1798  // is active.
1799  int64_t fLastMatchEnd; // First position after the end of the previous match,
1800  // or -1 if there was no previous match.
1801  int64_t fAppendPosition; // First position after the end of the previous
1802  // appendReplacement(). As described by the
1803  // JavaDoc for Java Matcher, where it is called
1804  // "append position"
1805  UBool fHitEnd; // True if the last match touched the end of input.
1806  UBool fRequireEnd; // True if the last match required end-of-input
1807  // (matched $ or Z)
1808 
1809  UVector64 *fStack;
1810  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1811  // which will contain the capture group results.
1812  // NOT valid while match engine is running.
1813 
1814  int64_t *fData; // Data area for use by the compiled pattern.
1815  int64_t fSmallData[8]; // Use this for data if it's enough.
1816 
1817  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1818  // match engine run. Zero for unlimited.
1819 
1820  int32_t fTime; // Match time, accumulates while matching.
1821  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1822  // Kept separately from fTime to keep as much
1823  // code as possible out of the inline
1824  // StateSave function.
1825 
1826  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1827  // stack, in bytes. Zero for unlimited.
1828 
1829  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1830  // NULL if there is no callback.
1831  const void *fCallbackContext; // User Context ptr for callback function.
1832 
1833  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function.
1834  // NULL if there is no callback.
1835  const void *fFindProgressCallbackContext; // User context pointer for the find progress callback.
1836 
1837 
1838  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1839 
1840  UBool fTraceDebug; // Set true for debug tracing of match engine.
1841 
1842  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1843  // reported, or that permanently disables this matcher.
1844 
1845  RuleBasedBreakIterator *fWordBreakItr;
1846 };
1847 
1849 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1850 #endif
#define RegexPatternDump(pat)
RegexPatternDump: debug function that displays the compiled form of a pattern.
Definition: regex.h:80
virtual UClassID getDynamicClassID() const =0
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
#define U_INTERNAL
This is used to declare a function as an internal ICU C API.
Definition: umachine.h:117
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:96
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1550
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressi...
Definition: regex.h:633
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:358
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:298
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:273
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:278
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1476
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:141
UText struct.
Definition: utext.h:1343
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:246
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:229
int8_t UBool
The ICU boolean type.
Definition: umachine.h:200