libstdc++
regex.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // See below __regex_algo_impl to get what this is talking about. The default
00032 // value 1 indicated a conservative optimization without giving up worst case
00033 // performance.
00034 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
00035 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
00036 #endif
00037 
00038 namespace std _GLIBCXX_VISIBILITY(default)
00039 {
00040 namespace __detail
00041 {
00042 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00043 
00044   // Result of merging regex_match and regex_search.
00045   //
00046   // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
00047   // the other one if possible, for test purpose).
00048   //
00049   // That __match_mode is true means regex_match, else regex_search.
00050   template<typename _BiIter, typename _Alloc,
00051        typename _CharT, typename _TraitsT,
00052        _RegexExecutorPolicy __policy,
00053        bool __match_mode>
00054     bool
00055     __regex_algo_impl(_BiIter                              __s,
00056               _BiIter                              __e,
00057               match_results<_BiIter, _Alloc>&      __m,
00058               const basic_regex<_CharT, _TraitsT>& __re,
00059               regex_constants::match_flag_type     __flags)
00060     {
00061       if (__re._M_automaton == nullptr)
00062     return false;
00063 
00064       typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
00065       __res.resize(__re._M_automaton->_M_sub_count() + 2);
00066       for (auto& __it : __res)
00067     __it.matched = false;
00068 
00069       // This function decide which executor to use under given circumstances.
00070       // The _S_auto policy now is the following: if a NFA has no
00071       // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
00072       // quantifiers (*, +, ?), the BFS executor will be used, other wise
00073       // DFS executor. This is because DFS executor has a exponential upper
00074       // bound, but better best-case performace. Meanwhile, BFS executor can
00075       // effectively prevent from exponential-long time matching (which must
00076       // contains many quantifiers), but it's slower in average.
00077       //
00078       // For simple regex, BFS executor could be 2 or more times slower than
00079       // DFS executor.
00080       //
00081       // Of course, BFS executor cannot handle back-references.
00082       bool __ret;
00083       if (!__re._M_automaton->_M_has_backref
00084       && (__policy == _RegexExecutorPolicy::_S_alternate
00085           || __re._M_automaton->_M_quant_count
00086         > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
00087     {
00088       _Executor<_BiIter, _Alloc, _TraitsT, false>
00089         __executor(__s, __e, __m, __re, __flags);
00090       if (__match_mode)
00091         __ret = __executor._M_match();
00092       else
00093         __ret = __executor._M_search();
00094     }
00095       else
00096     {
00097       _Executor<_BiIter, _Alloc, _TraitsT, true>
00098         __executor(__s, __e, __m, __re, __flags);
00099       if (__match_mode)
00100         __ret = __executor._M_match();
00101       else
00102         __ret = __executor._M_search();
00103     }
00104       if (__ret)
00105     {
00106       for (auto __it : __res)
00107         if (!__it.matched)
00108           __it.first = __it.second = __e;
00109       auto& __pre = __res[__res.size()-2];
00110       auto& __suf = __res[__res.size()-1];
00111       if (__match_mode)
00112         {
00113           __pre.matched = false;
00114           __pre.first = __s;
00115           __pre.second = __s;
00116           __suf.matched = false;
00117           __suf.first = __e;
00118           __suf.second = __e;
00119         }
00120       else
00121         {
00122           __pre.first = __s;
00123           __pre.second = __res[0].first;
00124           __pre.matched = (__pre.first != __pre.second);
00125           __suf.first = __res[0].second;
00126           __suf.second = __e;
00127           __suf.matched = (__suf.first != __suf.second);
00128         }
00129     }
00130       return __ret;
00131     }
00132 
00133 _GLIBCXX_END_NAMESPACE_VERSION
00134 }
00135 
00136 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00137 
00138   template<typename _Ch_type>
00139   template<typename _Fwd_iter>
00140     typename regex_traits<_Ch_type>::string_type
00141     regex_traits<_Ch_type>::
00142     lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
00143     {
00144       typedef std::ctype<char_type> __ctype_type;
00145       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00146 
00147       static const char* __collatenames[] =
00148     {
00149       "NUL",
00150       "SOH",
00151       "STX",
00152       "ETX",
00153       "EOT",
00154       "ENQ",
00155       "ACK",
00156       "alert",
00157       "backspace",
00158       "tab",
00159       "newline",
00160       "vertical-tab",
00161       "form-feed",
00162       "carriage-return",
00163       "SO",
00164       "SI",
00165       "DLE",
00166       "DC1",
00167       "DC2",
00168       "DC3",
00169       "DC4",
00170       "NAK",
00171       "SYN",
00172       "ETB",
00173       "CAN",
00174       "EM",
00175       "SUB",
00176       "ESC",
00177       "IS4",
00178       "IS3",
00179       "IS2",
00180       "IS1",
00181       "space",
00182       "exclamation-mark",
00183       "quotation-mark",
00184       "number-sign",
00185       "dollar-sign",
00186       "percent-sign",
00187       "ampersand",
00188       "apostrophe",
00189       "left-parenthesis",
00190       "right-parenthesis",
00191       "asterisk",
00192       "plus-sign",
00193       "comma",
00194       "hyphen",
00195       "period",
00196       "slash",
00197       "zero",
00198       "one",
00199       "two",
00200       "three",
00201       "four",
00202       "five",
00203       "six",
00204       "seven",
00205       "eight",
00206       "nine",
00207       "colon",
00208       "semicolon",
00209       "less-than-sign",
00210       "equals-sign",
00211       "greater-than-sign",
00212       "question-mark",
00213       "commercial-at",
00214       "A",
00215       "B",
00216       "C",
00217       "D",
00218       "E",
00219       "F",
00220       "G",
00221       "H",
00222       "I",
00223       "J",
00224       "K",
00225       "L",
00226       "M",
00227       "N",
00228       "O",
00229       "P",
00230       "Q",
00231       "R",
00232       "S",
00233       "T",
00234       "U",
00235       "V",
00236       "W",
00237       "X",
00238       "Y",
00239       "Z",
00240       "left-square-bracket",
00241       "backslash",
00242       "right-square-bracket",
00243       "circumflex",
00244       "underscore",
00245       "grave-accent",
00246       "a",
00247       "b",
00248       "c",
00249       "d",
00250       "e",
00251       "f",
00252       "g",
00253       "h",
00254       "i",
00255       "j",
00256       "k",
00257       "l",
00258       "m",
00259       "n",
00260       "o",
00261       "p",
00262       "q",
00263       "r",
00264       "s",
00265       "t",
00266       "u",
00267       "v",
00268       "w",
00269       "x",
00270       "y",
00271       "z",
00272       "left-curly-bracket",
00273       "vertical-line",
00274       "right-curly-bracket",
00275       "tilde",
00276       "DEL",
00277       ""
00278     };
00279 
00280       // same as boost
00281       //static const char* __digraphs[] =
00282       //  {
00283       //    "ae",
00284       //    "Ae",
00285       //    "AE",
00286       //    "ch",
00287       //    "Ch",
00288       //    "CH",
00289       //    "ll",
00290       //    "Ll",
00291       //    "LL",
00292       //    "ss",
00293       //    "Ss",
00294       //    "SS",
00295       //    "nj",
00296       //    "Nj",
00297       //    "NJ",
00298       //    "dz",
00299       //    "Dz",
00300       //    "DZ",
00301       //    "lj",
00302       //    "Lj",
00303       //    "LJ",
00304       //    ""
00305       //  };
00306 
00307       std::string __s(__last - __first, '?');
00308       __fctyp.narrow(__first, __last, '?', &*__s.begin());
00309 
00310       for (unsigned int __i = 0; *__collatenames[__i]; __i++)
00311     if (__s == __collatenames[__i])
00312       return string_type(1, __fctyp.widen(static_cast<char>(__i)));
00313 
00314       //for (unsigned int __i = 0; *__digraphs[__i]; __i++)
00315       //  {
00316       //    const char* __now = __digraphs[__i];
00317       //    if (__s == __now)
00318       //      {
00319       //    string_type ret(__s.size(), __fctyp.widen('?'));
00320       //    __fctyp.widen(__now, __now + 2/* ouch */, &*ret.begin());
00321       //    return ret;
00322       //      }
00323       //  }
00324       return string_type();
00325     }
00326 
00327   template<typename _Ch_type>
00328   template<typename _Fwd_iter>
00329     typename regex_traits<_Ch_type>::char_class_type
00330     regex_traits<_Ch_type>::
00331     lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
00332     {
00333       typedef std::ctype<char_type> __ctype_type;
00334       typedef std::ctype<char> __cctype_type;
00335       typedef const pair<const char*, char_class_type> _ClassnameEntry;
00336       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00337       const __cctype_type& __cctyp(use_facet<__cctype_type>(_M_locale));
00338 
00339       static _ClassnameEntry __classnames[] =
00340       {
00341     {"d", ctype_base::digit},
00342     {"w", {ctype_base::alnum, _RegexMask::_S_under}},
00343     {"s", ctype_base::space},
00344     {"alnum", ctype_base::alnum},
00345     {"alpha", ctype_base::alpha},
00346     {"blank", {0, _RegexMask::_S_blank}},
00347     {"cntrl", ctype_base::cntrl},
00348     {"digit", ctype_base::digit},
00349     {"graph", ctype_base::graph},
00350     {"lower", ctype_base::lower},
00351     {"print", ctype_base::print},
00352     {"punct", ctype_base::punct},
00353     {"space", ctype_base::space},
00354     {"upper", ctype_base::upper},
00355     {"xdigit", ctype_base::xdigit},
00356       };
00357 
00358       std::string __s(__last - __first, '?');
00359       __fctyp.narrow(__first, __last, '?', &__s[0]);
00360       __cctyp.tolower(&*__s.begin(), &*__s.begin() + __s.size());
00361       for (_ClassnameEntry* __it = __classnames;
00362        __it < *(&__classnames + 1);
00363        ++__it)
00364     {
00365       if (__s == __it->first)
00366         {
00367           if (__icase
00368           && ((__it->second
00369                & (ctype_base::lower | ctype_base::upper)) != 0))
00370         return ctype_base::alpha;
00371           return __it->second;
00372         }
00373     }
00374       return 0;
00375     }
00376 
00377   template<typename _Ch_type>
00378     bool
00379     regex_traits<_Ch_type>::
00380     isctype(_Ch_type __c, char_class_type __f) const
00381     {
00382       typedef std::ctype<char_type> __ctype_type;
00383       const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
00384 
00385       return __fctyp.is(__f._M_base, __c)
00386     // [[:w:]]
00387     || ((__f._M_extended & _RegexMask::_S_under)
00388         && __c == __fctyp.widen('_'))
00389     // [[:blank:]]
00390     || ((__f._M_extended & _RegexMask::_S_blank)
00391         && (__c == __fctyp.widen(' ')
00392         || __c == __fctyp.widen('\t')));
00393     }
00394 
00395   template<typename _Ch_type>
00396     int
00397     regex_traits<_Ch_type>::
00398     value(_Ch_type __ch, int __radix) const
00399     {
00400       std::basic_istringstream<char_type> __is(string_type(1, __ch));
00401       long __v;
00402       if (__radix == 8)
00403     __is >> std::oct;
00404       else if (__radix == 16)
00405     __is >> std::hex;
00406       __is >> __v;
00407       return __is.fail() ? -1 : __v;
00408     }
00409 
00410   template<typename _Bi_iter, typename _Alloc>
00411   template<typename _Out_iter>
00412     _Out_iter match_results<_Bi_iter, _Alloc>::
00413     format(_Out_iter __out,
00414        const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
00415        const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
00416        match_flag_type __flags) const
00417     {
00418       _GLIBCXX_DEBUG_ASSERT( ready() );
00419       regex_traits<char_type> __traits;
00420       typedef std::ctype<char_type> __ctype_type;
00421       const __ctype_type&
00422     __fctyp(use_facet<__ctype_type>(__traits.getloc()));
00423 
00424       auto __output = [&](size_t __idx)
00425     {
00426       auto& __sub = _Base_type::operator[](__idx);
00427       if (__sub.matched)
00428         __out = std::copy(__sub.first, __sub.second, __out);
00429     };
00430 
00431       if (__flags & regex_constants::format_sed)
00432     {
00433       for (; __fmt_first != __fmt_last;)
00434         if (*__fmt_first == '&')
00435           {
00436         __output(0);
00437         ++__fmt_first;
00438           }
00439         else if (*__fmt_first == '\\')
00440           {
00441         if (++__fmt_first != __fmt_last
00442             && __fctyp.is(__ctype_type::digit, *__fmt_first))
00443           __output(__traits.value(*__fmt_first++, 10));
00444         else
00445           *__out++ = '\\';
00446           }
00447         else
00448           *__out++ = *__fmt_first++;
00449     }
00450       else
00451     {
00452       while (1)
00453         {
00454           auto __next = std::find(__fmt_first, __fmt_last, '$');
00455           if (__next == __fmt_last)
00456         break;
00457 
00458           __out = std::copy(__fmt_first, __next, __out);
00459 
00460           auto __eat = [&](char __ch) -> bool
00461         {
00462           if (*__next == __ch)
00463             {
00464               ++__next;
00465               return true;
00466             }
00467           return false;
00468         };
00469 
00470           if (++__next == __fmt_last)
00471         *__out++ = '$';
00472           else if (__eat('$'))
00473         *__out++ = '$';
00474           else if (__eat('&'))
00475         __output(0);
00476           else if (__eat('`'))
00477         __output(_Base_type::size()-2);
00478           else if (__eat('\''))
00479         __output(_Base_type::size()-1);
00480           else if (__fctyp.is(__ctype_type::digit, *__next))
00481         {
00482           long __num = __traits.value(*__next, 10);
00483           if (++__next != __fmt_last
00484               && __fctyp.is(__ctype_type::digit, *__next))
00485             {
00486               __num *= 10;
00487               __num += __traits.value(*__next++, 10);
00488             }
00489           if (0 <= __num && __num < this->size())
00490             __output(__num);
00491         }
00492           else
00493         *__out++ = '$';
00494           __fmt_first = __next;
00495         }
00496       __out = std::copy(__fmt_first, __fmt_last, __out);
00497     }
00498       return __out;
00499     }
00500 
00501   template<typename _Out_iter, typename _Bi_iter,
00502        typename _Rx_traits, typename _Ch_type>
00503     _Out_iter
00504     regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
00505           const basic_regex<_Ch_type, _Rx_traits>& __e,
00506           const _Ch_type* __fmt,
00507           regex_constants::match_flag_type __flags)
00508     {
00509       typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
00510       _IterT __i(__first, __last, __e, __flags);
00511       _IterT __end;
00512       if (__i == __end)
00513     {
00514       if (!(__flags & regex_constants::format_no_copy))
00515         __out = std::copy(__first, __last, __out);
00516     }
00517       else
00518     {
00519       sub_match<_Bi_iter> __last;
00520       auto __len = char_traits<_Ch_type>::length(__fmt);
00521       for (; __i != __end; ++__i)
00522         {
00523           if (!(__flags & regex_constants::format_no_copy))
00524         __out = std::copy(__i->prefix().first, __i->prefix().second,
00525                   __out);
00526           __out = __i->format(__out, __fmt, __fmt + __len, __flags);
00527           __last = __i->suffix();
00528           if (__flags & regex_constants::format_first_only)
00529         break;
00530         }
00531       if (!(__flags & regex_constants::format_no_copy))
00532         __out = std::copy(__last.first, __last.second, __out);
00533     }
00534       return __out;
00535     }
00536 
00537   template<typename _Bi_iter,
00538        typename _Ch_type,
00539        typename _Rx_traits>
00540     bool
00541     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00542     operator==(const regex_iterator& __rhs) const
00543     {
00544       return (_M_match.empty() && __rhs._M_match.empty())
00545     || (_M_begin == __rhs._M_begin
00546         && _M_end == __rhs._M_end
00547         && _M_pregex == __rhs._M_pregex
00548         && _M_flags == __rhs._M_flags
00549         && _M_match[0] == __rhs._M_match[0]);
00550     }
00551 
00552   template<typename _Bi_iter,
00553        typename _Ch_type,
00554        typename _Rx_traits>
00555     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00556     regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00557     operator++()
00558     {
00559       // In all cases in which the call to regex_search returns true,
00560       // match.prefix().first shall be equal to the previous value of
00561       // match[0].second, and for each index i in the half-open range
00562       // [0, match.size()) for which match[i].matched is true,
00563       // match[i].position() shall return distance(begin, match[i].first).
00564       // [28.12.1.4.5]
00565       if (_M_match[0].matched)
00566     {
00567       auto __start = _M_match[0].second;
00568       auto __prefix_first = _M_match[0].second;
00569       if (_M_match[0].first == _M_match[0].second)
00570         {
00571           if (__start == _M_end)
00572         {
00573           _M_match = value_type();
00574           return *this;
00575         }
00576           else
00577         {
00578           if (regex_search(__start, _M_end, _M_match, *_M_pregex,
00579                    _M_flags
00580                    | regex_constants::match_not_null
00581                    | regex_constants::match_continuous))
00582             {
00583               _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
00584               _M_match.at(_M_match.size()).first = __prefix_first;
00585               _M_match._M_in_iterator = true;
00586               _M_match._M_begin = _M_begin;
00587               return *this;
00588             }
00589           else
00590             ++__start;
00591         }
00592         }
00593       _M_flags |= regex_constants::match_prev_avail;
00594       if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
00595         {
00596           _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
00597           _M_match.at(_M_match.size()).first = __prefix_first;
00598           _M_match._M_in_iterator = true;
00599           _M_match._M_begin = _M_begin;
00600         }
00601       else
00602         _M_match = value_type();
00603     }
00604       return *this;
00605     }
00606 
00607   template<typename _Bi_iter,
00608        typename _Ch_type,
00609        typename _Rx_traits>
00610     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00611     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00612     operator=(const regex_token_iterator& __rhs)
00613     {
00614       _M_position = __rhs._M_position;
00615       _M_subs = __rhs._M_subs;
00616       _M_n = __rhs._M_n;
00617       _M_result = __rhs._M_result;
00618       _M_suffix = __rhs._M_suffix;
00619       _M_has_m1 = __rhs._M_has_m1;
00620       if (__rhs._M_result == &__rhs._M_suffix)
00621     _M_result = &_M_suffix;
00622       return *this;
00623     }
00624 
00625   template<typename _Bi_iter,
00626        typename _Ch_type,
00627        typename _Rx_traits>
00628     bool
00629     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00630     operator==(const regex_token_iterator& __rhs) const
00631     {
00632       if (_M_end_of_seq() && __rhs._M_end_of_seq())
00633     return true;
00634       if (_M_suffix.matched && __rhs._M_suffix.matched
00635       && _M_suffix == __rhs._M_suffix)
00636     return true;
00637       if (_M_end_of_seq() || _M_suffix.matched
00638       || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
00639     return false;
00640       return _M_position == __rhs._M_position
00641     && _M_n == __rhs._M_n
00642     && _M_subs == __rhs._M_subs;
00643     }
00644 
00645   template<typename _Bi_iter,
00646        typename _Ch_type,
00647        typename _Rx_traits>
00648     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
00649     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00650     operator++()
00651     {
00652       _Position __prev = _M_position;
00653       if (_M_suffix.matched)
00654     *this = regex_token_iterator();
00655       else if (_M_n + 1 < _M_subs.size())
00656     {
00657       _M_n++;
00658       _M_result = &_M_current_match();
00659     }
00660       else
00661     {
00662       _M_n = 0;
00663       ++_M_position;
00664       if (_M_position != _Position())
00665         _M_result = &_M_current_match();
00666       else if (_M_has_m1 && __prev->suffix().length() != 0)
00667         {
00668           _M_suffix.matched = true;
00669           _M_suffix.first = __prev->suffix().first;
00670           _M_suffix.second = __prev->suffix().second;
00671           _M_result = &_M_suffix;
00672         }
00673       else
00674         *this = regex_token_iterator();
00675     }
00676       return *this;
00677     }
00678 
00679   template<typename _Bi_iter,
00680        typename _Ch_type,
00681        typename _Rx_traits>
00682     void
00683     regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
00684     _M_init(_Bi_iter __a, _Bi_iter __b)
00685     {
00686       _M_has_m1 = false;
00687       for (auto __it : _M_subs)
00688     if (__it == -1)
00689       {
00690         _M_has_m1 = true;
00691         break;
00692       }
00693       if (_M_position != _Position())
00694     _M_result = &_M_current_match();
00695       else if (_M_has_m1)
00696     {
00697       _M_suffix.matched = true;
00698       _M_suffix.first = __a;
00699       _M_suffix.second = __b;
00700       _M_result = &_M_suffix;
00701     }
00702       else
00703     _M_result = nullptr;
00704     }
00705 
00706 _GLIBCXX_END_NAMESPACE_VERSION
00707 } // namespace
00708