libstdc++
regex.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // See below __regex_algo_impl to get what this is talking about. The default
32 // value 1 indicated a conservative optimization without giving up worst case
33 // performance.
34 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
35 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
36 #endif
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44  // Result of merging regex_match and regex_search.
45  //
46  // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47  // the other one if possible, for test purpose).
48  //
49  // That __match_mode is true means regex_match, else regex_search.
50  template<typename _BiIter, typename _Alloc,
51  typename _CharT, typename _TraitsT,
52  _RegexExecutorPolicy __policy,
53  bool __match_mode>
54  bool
55  __regex_algo_impl(_BiIter __s,
56  _BiIter __e,
57  match_results<_BiIter, _Alloc>& __m,
58  const basic_regex<_CharT, _TraitsT>& __re,
60  {
61  if (__re._M_automaton == nullptr)
62  return false;
63 
64  typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65  __m._M_begin = __s;
66  __res.resize(__re._M_automaton->_M_sub_count() + 2);
67  for (auto& __it : __res)
68  __it.matched = false;
69 
70  // This function decide which executor to use under given circumstances.
71  // The _S_auto policy now is the following: if a NFA has no
72  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
73  // quantifiers (*, +, ?), the BFS executor will be used, other wise
74  // DFS executor. This is because DFS executor has a exponential upper
75  // bound, but better best-case performace. Meanwhile, BFS executor can
76  // effectively prevent from exponential-long time matching (which must
77  // contains many quantifiers), but it's slower in average.
78  //
79  // For simple regex, BFS executor could be 2 or more times slower than
80  // DFS executor.
81  //
82  // Of course, BFS executor cannot handle back-references.
83  bool __ret;
84  if (!__re._M_automaton->_M_has_backref
85  && (__policy == _RegexExecutorPolicy::_S_alternate
86  || __re._M_automaton->_M_quant_count
87  > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
88  {
89  _Executor<_BiIter, _Alloc, _TraitsT, false>
90  __executor(__s, __e, __m, __re, __flags);
91  if (__match_mode)
92  __ret = __executor._M_match();
93  else
94  __ret = __executor._M_search();
95  }
96  else
97  {
98  _Executor<_BiIter, _Alloc, _TraitsT, true>
99  __executor(__s, __e, __m, __re, __flags);
100  if (__match_mode)
101  __ret = __executor._M_match();
102  else
103  __ret = __executor._M_search();
104  }
105  if (__ret)
106  {
107  for (auto __it : __res)
108  if (!__it.matched)
109  __it.first = __it.second = __e;
110  auto& __pre = __res[__res.size()-2];
111  auto& __suf = __res[__res.size()-1];
112  if (__match_mode)
113  {
114  __pre.matched = false;
115  __pre.first = __s;
116  __pre.second = __s;
117  __suf.matched = false;
118  __suf.first = __e;
119  __suf.second = __e;
120  }
121  else
122  {
123  __pre.first = __s;
124  __pre.second = __res[0].first;
125  __pre.matched = (__pre.first != __pre.second);
126  __suf.first = __res[0].second;
127  __suf.second = __e;
128  __suf.matched = (__suf.first != __suf.second);
129  }
130  }
131  return __ret;
132  }
133 
134 _GLIBCXX_END_NAMESPACE_VERSION
135 }
136 
137 _GLIBCXX_BEGIN_NAMESPACE_VERSION
138 
139  template<typename _Ch_type>
140  template<typename _Fwd_iter>
141  typename regex_traits<_Ch_type>::string_type
143  lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
144  {
145  typedef std::ctype<char_type> __ctype_type;
146  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
147 
148  static const char* __collatenames[] =
149  {
150  "NUL",
151  "SOH",
152  "STX",
153  "ETX",
154  "EOT",
155  "ENQ",
156  "ACK",
157  "alert",
158  "backspace",
159  "tab",
160  "newline",
161  "vertical-tab",
162  "form-feed",
163  "carriage-return",
164  "SO",
165  "SI",
166  "DLE",
167  "DC1",
168  "DC2",
169  "DC3",
170  "DC4",
171  "NAK",
172  "SYN",
173  "ETB",
174  "CAN",
175  "EM",
176  "SUB",
177  "ESC",
178  "IS4",
179  "IS3",
180  "IS2",
181  "IS1",
182  "space",
183  "exclamation-mark",
184  "quotation-mark",
185  "number-sign",
186  "dollar-sign",
187  "percent-sign",
188  "ampersand",
189  "apostrophe",
190  "left-parenthesis",
191  "right-parenthesis",
192  "asterisk",
193  "plus-sign",
194  "comma",
195  "hyphen",
196  "period",
197  "slash",
198  "zero",
199  "one",
200  "two",
201  "three",
202  "four",
203  "five",
204  "six",
205  "seven",
206  "eight",
207  "nine",
208  "colon",
209  "semicolon",
210  "less-than-sign",
211  "equals-sign",
212  "greater-than-sign",
213  "question-mark",
214  "commercial-at",
215  "A",
216  "B",
217  "C",
218  "D",
219  "E",
220  "F",
221  "G",
222  "H",
223  "I",
224  "J",
225  "K",
226  "L",
227  "M",
228  "N",
229  "O",
230  "P",
231  "Q",
232  "R",
233  "S",
234  "T",
235  "U",
236  "V",
237  "W",
238  "X",
239  "Y",
240  "Z",
241  "left-square-bracket",
242  "backslash",
243  "right-square-bracket",
244  "circumflex",
245  "underscore",
246  "grave-accent",
247  "a",
248  "b",
249  "c",
250  "d",
251  "e",
252  "f",
253  "g",
254  "h",
255  "i",
256  "j",
257  "k",
258  "l",
259  "m",
260  "n",
261  "o",
262  "p",
263  "q",
264  "r",
265  "s",
266  "t",
267  "u",
268  "v",
269  "w",
270  "x",
271  "y",
272  "z",
273  "left-curly-bracket",
274  "vertical-line",
275  "right-curly-bracket",
276  "tilde",
277  "DEL",
278  };
279 
280  string __s;
281  for (; __first != __last; ++__first)
282  __s += __fctyp.narrow(*__first, 0);
283 
284  for (const auto& __it : __collatenames)
285  if (__s == __it)
286  return string_type(1, __fctyp.widen(
287  static_cast<char>(&__it - __collatenames)));
288 
289  // TODO Add digraph support:
290  // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
291 
292  return string_type();
293  }
294 
295  template<typename _Ch_type>
296  template<typename _Fwd_iter>
297  typename regex_traits<_Ch_type>::char_class_type
299  lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
300  {
301  typedef std::ctype<char_type> __ctype_type;
302  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
303 
304  // Mappings from class name to class mask.
305  static const pair<const char*, char_class_type> __classnames[] =
306  {
307  {"d", ctype_base::digit},
308  {"w", {ctype_base::alnum, _RegexMask::_S_under}},
309  {"s", ctype_base::space},
310  {"alnum", ctype_base::alnum},
311  {"alpha", ctype_base::alpha},
312  {"blank", {0, _RegexMask::_S_blank}},
313  {"cntrl", ctype_base::cntrl},
314  {"digit", ctype_base::digit},
315  {"graph", ctype_base::graph},
316  {"lower", ctype_base::lower},
317  {"print", ctype_base::print},
318  {"punct", ctype_base::punct},
319  {"space", ctype_base::space},
320  {"upper", ctype_base::upper},
321  {"xdigit", ctype_base::xdigit},
322  };
323 
324  string __s;
325  for (; __first != __last; ++__first)
326  __s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
327 
328  for (const auto& __it : __classnames)
329  if (__s == __it.first)
330  {
331  if (__icase
332  && ((__it.second
333  & (ctype_base::lower | ctype_base::upper)) != 0))
334  return ctype_base::alpha;
335  return __it.second;
336  }
337  return 0;
338  }
339 
340  template<typename _Ch_type>
341  bool
343  isctype(_Ch_type __c, char_class_type __f) const
344  {
345  typedef std::ctype<char_type> __ctype_type;
346  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
347 
348  return __fctyp.is(__f._M_base, __c)
349  // [[:w:]]
350  || ((__f._M_extended & _RegexMask::_S_under)
351  && __c == __fctyp.widen('_'))
352  // [[:blank:]]
353  || ((__f._M_extended & _RegexMask::_S_blank)
354  && (__c == __fctyp.widen(' ')
355  || __c == __fctyp.widen('\t')));
356  }
357 
358  template<typename _Ch_type>
359  int
361  value(_Ch_type __ch, int __radix) const
362  {
364  long __v;
365  if (__radix == 8)
366  __is >> std::oct;
367  else if (__radix == 16)
368  __is >> std::hex;
369  __is >> __v;
370  return __is.fail() ? -1 : __v;
371  }
372 
373  template<typename _Bi_iter, typename _Alloc>
374  template<typename _Out_iter>
376  format(_Out_iter __out,
377  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
378  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
379  match_flag_type __flags) const
380  {
381  _GLIBCXX_DEBUG_ASSERT( ready() );
382  regex_traits<char_type> __traits;
383  typedef std::ctype<char_type> __ctype_type;
384  const __ctype_type&
385  __fctyp(use_facet<__ctype_type>(__traits.getloc()));
386 
387  auto __output = [&](size_t __idx)
388  {
389  auto& __sub = _Base_type::operator[](__idx);
390  if (__sub.matched)
391  __out = std::copy(__sub.first, __sub.second, __out);
392  };
393 
394  if (__flags & regex_constants::format_sed)
395  {
396  for (; __fmt_first != __fmt_last;)
397  if (*__fmt_first == '&')
398  {
399  __output(0);
400  ++__fmt_first;
401  }
402  else if (*__fmt_first == '\\')
403  {
404  if (++__fmt_first != __fmt_last
405  && __fctyp.is(__ctype_type::digit, *__fmt_first))
406  __output(__traits.value(*__fmt_first++, 10));
407  else
408  *__out++ = '\\';
409  }
410  else
411  *__out++ = *__fmt_first++;
412  }
413  else
414  {
415  while (1)
416  {
417  auto __next = std::find(__fmt_first, __fmt_last, '$');
418  if (__next == __fmt_last)
419  break;
420 
421  __out = std::copy(__fmt_first, __next, __out);
422 
423  auto __eat = [&](char __ch) -> bool
424  {
425  if (*__next == __ch)
426  {
427  ++__next;
428  return true;
429  }
430  return false;
431  };
432 
433  if (++__next == __fmt_last)
434  *__out++ = '$';
435  else if (__eat('$'))
436  *__out++ = '$';
437  else if (__eat('&'))
438  __output(0);
439  else if (__eat('`'))
440  __output(_Base_type::size()-2);
441  else if (__eat('\''))
442  __output(_Base_type::size()-1);
443  else if (__fctyp.is(__ctype_type::digit, *__next))
444  {
445  long __num = __traits.value(*__next, 10);
446  if (++__next != __fmt_last
447  && __fctyp.is(__ctype_type::digit, *__next))
448  {
449  __num *= 10;
450  __num += __traits.value(*__next++, 10);
451  }
452  if (0 <= __num && __num < this->size())
453  __output(__num);
454  }
455  else
456  *__out++ = '$';
457  __fmt_first = __next;
458  }
459  __out = std::copy(__fmt_first, __fmt_last, __out);
460  }
461  return __out;
462  }
463 
464  template<typename _Out_iter, typename _Bi_iter,
465  typename _Rx_traits, typename _Ch_type>
466  _Out_iter
467  regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
469  const _Ch_type* __fmt,
471  {
473  _IterT __i(__first, __last, __e, __flags);
474  _IterT __end;
475  if (__i == __end)
476  {
477  if (!(__flags & regex_constants::format_no_copy))
478  __out = std::copy(__first, __last, __out);
479  }
480  else
481  {
482  sub_match<_Bi_iter> __last;
483  auto __len = char_traits<_Ch_type>::length(__fmt);
484  for (; __i != __end; ++__i)
485  {
486  if (!(__flags & regex_constants::format_no_copy))
487  __out = std::copy(__i->prefix().first, __i->prefix().second,
488  __out);
489  __out = __i->format(__out, __fmt, __fmt + __len, __flags);
490  __last = __i->suffix();
492  break;
493  }
494  if (!(__flags & regex_constants::format_no_copy))
495  __out = std::copy(__last.first, __last.second, __out);
496  }
497  return __out;
498  }
499 
500  template<typename _Bi_iter,
501  typename _Ch_type,
502  typename _Rx_traits>
503  bool
505  operator==(const regex_iterator& __rhs) const
506  {
507  return (_M_match.empty() && __rhs._M_match.empty())
508  || (_M_begin == __rhs._M_begin
509  && _M_end == __rhs._M_end
510  && _M_pregex == __rhs._M_pregex
511  && _M_flags == __rhs._M_flags
512  && _M_match[0] == __rhs._M_match[0]);
513  }
514 
515  template<typename _Bi_iter,
516  typename _Ch_type,
517  typename _Rx_traits>
521  {
522  // In all cases in which the call to regex_search returns true,
523  // match.prefix().first shall be equal to the previous value of
524  // match[0].second, and for each index i in the half-open range
525  // [0, match.size()) for which match[i].matched is true,
526  // match[i].position() shall return distance(begin, match[i].first).
527  // [28.12.1.4.5]
528  if (_M_match[0].matched)
529  {
530  auto __start = _M_match[0].second;
531  auto __prefix_first = _M_match[0].second;
532  if (_M_match[0].first == _M_match[0].second)
533  {
534  if (__start == _M_end)
535  {
536  _M_match = value_type();
537  return *this;
538  }
539  else
540  {
541  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
542  _M_flags
545  {
546  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
547  auto& __prefix = _M_match.at(_M_match.size());
548  __prefix.first = __prefix_first;
549  __prefix.matched = __prefix.first != __prefix.second;
550  // [28.12.1.4.5]
551  _M_match._M_begin = _M_begin;
552  return *this;
553  }
554  else
555  ++__start;
556  }
557  }
559  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
560  {
561  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
562  auto& __prefix = _M_match.at(_M_match.size());
563  __prefix.first = __prefix_first;
564  __prefix.matched = __prefix.first != __prefix.second;
565  // [28.12.1.4.5]
566  _M_match._M_begin = _M_begin;
567  }
568  else
569  _M_match = value_type();
570  }
571  return *this;
572  }
573 
574  template<typename _Bi_iter,
575  typename _Ch_type,
576  typename _Rx_traits>
580  {
581  _M_position = __rhs._M_position;
582  _M_subs = __rhs._M_subs;
583  _M_n = __rhs._M_n;
584  _M_suffix = __rhs._M_suffix;
585  _M_has_m1 = __rhs._M_has_m1;
586  _M_normalize_result();
587  return *this;
588  }
589 
590  template<typename _Bi_iter,
591  typename _Ch_type,
592  typename _Rx_traits>
593  bool
596  {
597  if (_M_end_of_seq() && __rhs._M_end_of_seq())
598  return true;
599  if (_M_suffix.matched && __rhs._M_suffix.matched
600  && _M_suffix == __rhs._M_suffix)
601  return true;
602  if (_M_end_of_seq() || _M_suffix.matched
603  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
604  return false;
605  return _M_position == __rhs._M_position
606  && _M_n == __rhs._M_n
607  && _M_subs == __rhs._M_subs;
608  }
609 
610  template<typename _Bi_iter,
611  typename _Ch_type,
612  typename _Rx_traits>
616  {
617  _Position __prev = _M_position;
618  if (_M_suffix.matched)
619  *this = regex_token_iterator();
620  else if (_M_n + 1 < _M_subs.size())
621  {
622  _M_n++;
623  _M_result = &_M_current_match();
624  }
625  else
626  {
627  _M_n = 0;
628  ++_M_position;
629  if (_M_position != _Position())
630  _M_result = &_M_current_match();
631  else if (_M_has_m1 && __prev->suffix().length() != 0)
632  {
633  _M_suffix.matched = true;
634  _M_suffix.first = __prev->suffix().first;
635  _M_suffix.second = __prev->suffix().second;
636  _M_result = &_M_suffix;
637  }
638  else
639  *this = regex_token_iterator();
640  }
641  return *this;
642  }
643 
644  template<typename _Bi_iter,
645  typename _Ch_type,
646  typename _Rx_traits>
647  void
649  _M_init(_Bi_iter __a, _Bi_iter __b)
650  {
651  _M_has_m1 = false;
652  for (auto __it : _M_subs)
653  if (__it == -1)
654  {
655  _M_has_m1 = true;
656  break;
657  }
658  if (_M_position != _Position())
659  _M_result = &_M_current_match();
660  else if (_M_has_m1)
661  {
662  _M_suffix.matched = true;
663  _M_suffix.first = __a;
664  _M_suffix.second = __b;
665  _M_result = &_M_suffix;
666  }
667  else
668  _M_result = nullptr;
669  }
670 
671 _GLIBCXX_END_NAMESPACE_VERSION
672 } // namespace
673 
regex_token_iterator & operator=(const regex_token_iterator &__rhs)
Assigns a regex_token_iterator to another.
Definition: regex.tcc:579
Describes aspects of a regular expression.
Definition: regex.h:91
_Out_iter regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last, const basic_regex< _Ch_type, _Rx_traits > &__e, const basic_string< _Ch_type, _St, _Sa > &__fmt, regex_constants::match_flag_type __flags=regex_constants::match_default)
Search for a regular expression within a range for multiple times, and replace the matched parts thro...
Definition: regex.h:2274
Controlling input for std::string.
Definition: iosfwd:97
int value(_Ch_type __ch, int __radix) const
Converts a digit to an int.
Definition: regex.tcc:361
Basis for explicit traits specializations.
Definition: char_traits.h:227
match_flag_type
This is a bitmask type indicating regex matching rules.
ios_base & oct(ios_base &__base)
Calls base.setf(ios_base::oct, ios_base::basefield).
Definition: ios_base.h:949
bool regex_search(_Bi_iter __s, _Bi_iter __e, match_results< _Bi_iter, _Alloc > &__m, const basic_regex< _Ch_type, _Rx_traits > &__re, regex_constants::match_flag_type __flags=regex_constants::match_default)
Definition: regex.h:2140
constexpr size_t size() const noexcept
Returns the total number of bits.
Definition: bitset:1293
bool fail() const
Fast error checking.
Definition: basic_ios.h:195
regex_token_iterator & operator++()
Increments a regex_token_iterator.
Definition: regex.tcc:615
_T2 second
first is a copy of the first object
Definition: stl_pair.h:102
char_class_type lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase=false) const
Maps one or more characters to a named character classification.
bool isctype(_Ch_type __c, char_class_type __f) const
Determines if c is a member of an identified class.
Definition: regex.tcc:343
bool operator==(const regex_token_iterator &__rhs) const
Compares a regex_token_iterator to another for equality.
Definition: regex.tcc:595
_T1 first
second_type is the second bound type
Definition: stl_pair.h:101
Managing sequences of characters and character-like objects.
Definition: basic_string.h:112
bool operator==(const regex_iterator &__rhs) const
Tests the equivalence of two regex iterators.
Definition: regex.tcc:505
_Out_iter format(_Out_iter __out, const char_type *__fmt_first, const char_type *__fmt_last, match_flag_type __flags=regex_constants::format_default) const
string_type lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
Gets a collation element by name.
ios_base & hex(ios_base &__base)
Calls base.setf(ios_base::hex, ios_base::basefield).
Definition: ios_base.h:941
locale_type getloc() const
Gets a copy of the current locale in use by the regex_traits object.
Definition: regex.h:384
bool empty() const
Indicates if the match_results contains no results.
Definition: regex.h:1665
regex_iterator & operator++()
Increments a regex_iterator.
Definition: regex.tcc:520
reference operator[](size_t __position)
Array-indexing support.
Definition: bitset:1156