001/* Matcher.java -- Instance of a regular expression applied to a char sequence.
002   Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
003
004This file is part of GNU Classpath.
005
006GNU Classpath is free software; you can redistribute it and/or modify
007it under the terms of the GNU General Public License as published by
008the Free Software Foundation; either version 2, or (at your option)
009any later version.
010
011GNU Classpath is distributed in the hope that it will be useful, but
012WITHOUT ANY WARRANTY; without even the implied warranty of
013MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014General Public License for more details.
015
016You should have received a copy of the GNU General Public License
017along with GNU Classpath; see the file COPYING.  If not, write to the
018Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
01902110-1301 USA.
020
021Linking this library statically or dynamically with other modules is
022making a combined work based on this library.  Thus, the terms and
023conditions of the GNU General Public License cover the whole
024combination.
025
026As a special exception, the copyright holders of this library give you
027permission to link this library with independent modules to produce an
028executable, regardless of the license terms of these independent
029modules, and to copy and distribute the resulting executable under
030terms of your choice, provided that you also meet, for each linked
031independent module, the terms and conditions of the license of that
032module.  An independent module is a module which is not derived from
033or based on this library.  If you modify this library, you may extend
034this exception to your version of the library, but you are not
035obligated to do so.  If you do not wish to do so, delete this
036exception statement from your version. */
037
038
039package java.util.regex;
040
041import gnu.java.lang.CPStringBuilder;
042
043import gnu.java.util.regex.CharIndexed;
044import gnu.java.util.regex.RE;
045import gnu.java.util.regex.REMatch;
046
047/**
048 * Instance of a regular expression applied to a char sequence.
049 *
050 * @since 1.4
051 */
052public final class Matcher implements MatchResult
053{
  // The compiled pattern applied by this matcher; assigned by the constructor.
  private Pattern pattern;
  // The character sequence being matched; replaced by reset(CharSequence).
  private CharSequence input;
  // We use CharIndexed as an input object to the getMatch method in order
  // that /\G/ (the end of the previous match) may work.  The information
  // of the previous match is stored in the CharIndexed object.
  private CharIndexed inputCharIndexed;
  // Index handed to the regex engine by the next find(); every successful
  // match operation stores the end index of its match here.
  private int position;
  // Index in the input from which appendReplacement and appendTail copy
  // the next stretch of unmatched text.
  private int appendPosition;
  // Result of the most recent match operation; null until one has been
  // attempted, after reset(), and after a failed attempt.
  private REMatch match;

  /**
   * The start of the region of the input on which to match.
   */
  private int regionStart;

  /**
   * The end of the region of the input on which to match.
   */
  private int regionEnd;
  
  /**
   * True if the match process should look beyond the 
   * region marked by regionStart to regionEnd when
   * performing lookAhead, lookBehind and boundary
   * matching.
   */
  private boolean transparentBounds;

  /**
   * The flags that affect the anchoring bounds.
   * If {@link #hasAnchoringBounds()} is {@code true},
   * the match process will honour the
   * anchoring bounds: ^, \A, \Z, \z and $.  If
   * {@link #hasAnchoringBounds()} is {@code false},
   * the anchors are ignored and appropriate flags,
   * stored in this variable, are used to provide this
   * behaviour.
   */
  private int anchoringBounds;
093
094  Matcher(Pattern pattern, CharSequence input)
095  {
096    this.pattern = pattern;
097    this.input = input;
098    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
099    regionStart = 0;
100    regionEnd = input.length();
101    transparentBounds = false;
102    anchoringBounds = 0;
103  }
104  
105  /**
106   * @param sb The target string buffer
107   * @param replacement The replacement string
108   *
109   * @exception IllegalStateException If no match has yet been attempted,
110   * or if the previous match operation failed
111   * @exception IndexOutOfBoundsException If the replacement string refers
112   * to a capturing group that does not exist in the pattern
113   */
114  public Matcher appendReplacement (StringBuffer sb, String replacement)
115    throws IllegalStateException
116  {
117    assertMatchOp();
118    sb.append(input.subSequence(appendPosition,
119                                match.getStartIndex()).toString());
120    sb.append(RE.getReplacement(replacement, match,
121        RE.REG_REPLACE_USE_BACKSLASHESCAPE));
122    appendPosition = match.getEndIndex();
123    return this;
124  }
125
126  /**
127   * @param sb The target string buffer
128   */
129  public StringBuffer appendTail (StringBuffer sb)
130  {
131    sb.append(input.subSequence(appendPosition, input.length()).toString());
132    return sb;
133  }
134 
135  /**
136   * @exception IllegalStateException If no match has yet been attempted,
137   * or if the previous match operation failed
138   */
139  public int end ()
140    throws IllegalStateException
141  {
142    assertMatchOp();
143    return match.getEndIndex();
144  }
145  
146  /**
147   * @param group The index of a capturing group in this matcher's pattern
148   *
149   * @exception IllegalStateException If no match has yet been attempted,
150   * or if the previous match operation failed
151   * @exception IndexOutOfBoundsException If the replacement string refers
152   * to a capturing group that does not exist in the pattern
153   */
154  public int end (int group)
155    throws IllegalStateException
156  {
157    assertMatchOp();
158    return match.getEndIndex(group);
159  }
160 
161  public boolean find ()
162  {
163    boolean first = (match == null);
164    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
165      match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
166    else
167      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
168                                       position, anchoringBounds);
169    if (match != null)
170      {
171        int endIndex = match.getEndIndex();
172        // Are we stuck at the same position?
173        if (!first && endIndex == position)
174          {         
175            match = null;
176            // Not at the end of the input yet?
177            if (position < input.length() - 1)
178              {
179                position++;
180                return find(position);
181              }
182            else
183              return false;
184          }
185        position = endIndex;
186        return true;
187      }
188    return false;
189  } 
190
191  /**
192   * @param start The index to start the new pattern matching
193   *
194   * @exception IndexOutOfBoundsException If the replacement string refers
195   * to a capturing group that does not exist in the pattern
196   */
197  public boolean find (int start)
198  {
199    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
200      match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
201    else
202      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
203                                       start, anchoringBounds);
204    if (match != null)
205      {
206        position = match.getEndIndex();
207        return true;
208      }
209    return false;
210  }
211 
212  /**
213   * @exception IllegalStateException If no match has yet been attempted,
214   * or if the previous match operation failed
215   */
216  public String group ()
217  {
218    assertMatchOp();
219    return match.toString();
220  }
221  
222  /**
223   * @param group The index of a capturing group in this matcher's pattern
224   *
225   * @exception IllegalStateException If no match has yet been attempted,
226   * or if the previous match operation failed
227   * @exception IndexOutOfBoundsException If the replacement string refers
228   * to a capturing group that does not exist in the pattern
229   */
230  public String group (int group)
231    throws IllegalStateException
232  {
233    assertMatchOp();
234    return match.toString(group);
235  }
236
237  /**
238   * @param replacement The replacement string
239   */
240  public String replaceFirst (String replacement)
241  {
242    reset();
243    // Semantics might not quite match
244    return pattern.getRE().substitute(input, replacement, position,
245        RE.REG_REPLACE_USE_BACKSLASHESCAPE);
246  }
247
248  /**
249   * @param replacement The replacement string
250   */
251  public String replaceAll (String replacement)
252  {
253    reset();
254    return pattern.getRE().substituteAll(input, replacement, position,
255        RE.REG_REPLACE_USE_BACKSLASHESCAPE);
256  }
257  
258  public int groupCount ()
259  {
260    return pattern.getRE().getNumSubs();
261  }
262 
263  public boolean lookingAt ()
264  {
265    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
266      match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
267                                       anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
268    else
269      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
270                                       anchoringBounds|RE.REG_FIX_STARTING_POSITION);
271    if (match != null)
272      {
273        if (match.getStartIndex() == 0)
274          {
275            position = match.getEndIndex();
276            return true;
277          }
278        match = null;
279      }
280    return false;
281  }
282  
283  /**
284   * Attempts to match the entire input sequence against the pattern. 
285   *
286   * If the match succeeds then more information can be obtained via the
287   * start, end, and group methods.
288   *
289   * @see #start()
290   * @see #end()
291   * @see #group()
292   */
293  public boolean matches ()
294  {
295    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
296      match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
297                                       anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
298    else
299      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
300                                       anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
301    if (match != null)
302      {
303        if (match.getStartIndex() == 0)
304          {
305            position = match.getEndIndex();
306            if (position == input.length())
307                return true;
308          }
309        match = null;
310      }
311    return false;
312  }
313  
314  /**
315   * Returns the Pattern that is interpreted by this Matcher
316   */
317  public Pattern pattern ()
318  {
319    return pattern;
320  }
321  
322  /**
323   * Resets the internal state of the matcher, including
324   * resetting the region to its default state of encompassing
325   * the whole input.  The state of {@link #hasTransparentBounds()}
326   * and {@link #hasAnchoringBounds()} are unaffected.
327   *
328   * @return a reference to this matcher.
329   * @see #regionStart()
330   * @see #regionEnd()
331   * @see #hasTransparentBounds()
332   * @see #hasAnchoringBounds()
333   */
334  public Matcher reset ()
335  {
336    position = 0;
337    match = null;
338    regionStart = 0;
339    regionEnd = input.length();
340    appendPosition = 0;
341    return this;
342  }
343  
344  /**
345   * Resets the internal state of the matcher, including
346   * resetting the region to its default state of encompassing
347   * the whole input.  The state of {@link #hasTransparentBounds()}
348   * and {@link #hasAnchoringBounds()} are unaffected.
349   *
350   * @param input The new input character sequence.
351   * @return a reference to this matcher.
352   * @see #regionStart()
353   * @see #regionEnd()
354   * @see #hasTransparentBounds()
355   * @see #hasAnchoringBounds()
356   */
357  public Matcher reset (CharSequence input)
358  {
359    this.input = input;
360    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
361    return reset();
362  }
363  
364  /**
365   * @return the index of a capturing group in this matcher's pattern
366   *
367   * @exception IllegalStateException If no match has yet been attempted,
368   * or if the previous match operation failed
369   */
370  public int start ()
371    throws IllegalStateException
372  {
373    assertMatchOp();
374    return match.getStartIndex();
375  }
376
377  /**
378   * @param group The index of a capturing group in this matcher's pattern
379   *
380   * @exception IllegalStateException If no match has yet been attempted,
381   * or if the previous match operation failed
382   * @exception IndexOutOfBoundsException If the replacement string refers
383   * to a capturing group that does not exist in the pattern
384   */
385  public int start (int group)
386    throws IllegalStateException
387  {
388    assertMatchOp();
389    return match.getStartIndex(group);
390  }
391
392  /**
393   * @return True if and only if the matcher hit the end of input.
394   * @since 1.5
395   */
396  public boolean hitEnd()
397  {
398    return inputCharIndexed.hitEnd();
399  }
400
401  /**
402   * @return A string expression of this matcher.
403   */
404  public String toString()
405  {
406    CPStringBuilder sb = new CPStringBuilder();
407    sb.append(this.getClass().getName())
408      .append("[pattern=").append(pattern.pattern())
409      .append(" region=").append(regionStart).append(",").append(regionEnd)
410      .append(" anchoringBounds=").append(anchoringBounds == 0)
411      .append(" transparentBounds=").append(transparentBounds)
412      .append(" lastmatch=").append(match == null ? "" : match.toString())
413      .append("]");
414    return sb.toString();
415  }
416
417  private void assertMatchOp()
418  {
419    if (match == null) throw new IllegalStateException();
420  }
421
422  /**
423   * <p>
424   * Defines the region of the input on which to match.
425   * By default, the {@link Matcher} attempts to match
426   * the whole string (from 0 to the length of the input),
427   * but a region between {@code start} (inclusive) and
428   * {@code end} (exclusive) on which to match may instead
429   * be defined using this method.
430   * </p>
431   * <p>
432   * The behaviour of region matching is further affected
433   * by the use of transparent or opaque bounds (see
434   * {@link #useTransparentBounds(boolean)}) and whether or not
435   * anchors ({@code ^} and {@code $}) are in use
436   * (see {@link #useAnchoringBounds(boolean)}).  With transparent
437   * bounds, the matcher is aware of input outside the bounds
438   * set by this method, whereas, with opaque bounds (the default)
439   * only the input within the bounds is used.  The use of
440   * anchors are affected by this setting; with transparent
441   * bounds, anchors will match the beginning of the real input,
442   * while with opaque bounds they match the beginning of the
443   * region.  {@link #useAnchoringBounds(boolean)} can be used
444   * to turn on or off the matching of anchors.
445   * </p>
446   *
447   * @param start the start of the region (inclusive).
448   * @param end the end of the region (exclusive).
449   * @return a reference to this matcher.
450   * @throws IndexOutOfBoundsException if either {@code start} or
451   *                                   {@code end} are less than zero,
452   *                                   if either {@code start} or
453   *                                   {@code end} are greater than the
454   *                                   length of the input, or if
455   *                                   {@code start} is greater than
456   *                                   {@code end}.
457   * @see #regionStart()
458   * @see #regionEnd()
459   * @see #hasTransparentBounds()
460   * @see #useTransparentBounds(boolean)
461   * @see #hasAnchoringBounds()
462   * @see #useAnchoringBounds(boolean)
463   * @since 1.5
464   */
465  public Matcher region(int start, int end)
466  {
467    int length = input.length();
468    if (start < 0)
469      throw new IndexOutOfBoundsException("The start position was less than zero.");
470    if (start >= length)
471      throw new IndexOutOfBoundsException("The start position is after the end of the input.");
472    if (end < 0)
473      throw new IndexOutOfBoundsException("The end position was less than zero.");
474    if (end > length)
475      throw new IndexOutOfBoundsException("The end position is after the end of the input.");
476    if (start > end)
477      throw new IndexOutOfBoundsException("The start position is after the end position.");
478    reset();
479    regionStart = start;
480    regionEnd = end;
481    return this;
482  }
483
484  /**
485   * The start of the region on which to perform matches (inclusive).
486   *
487   * @return the start index of the region.
488   * @see #region(int,int)
489   * #see #regionEnd()
490   * @since 1.5
491   */
492  public int regionStart()
493  {
494    return regionStart;
495  }
496  
497  /**
498   * The end of the region on which to perform matches (exclusive).
499   *
500   * @return the end index of the region.
501   * @see #region(int,int)
502   * @see #regionStart()
503   * @since 1.5
504   */
505  public int regionEnd()
506  {
507    return regionEnd;
508  }
509
510  /**
511   * Returns true if the bounds of the region marked by
512   * {@link #regionStart()} and {@link #regionEnd()} are
513   * transparent.  When these bounds are transparent, the
514   * matching process can look beyond them to perform
515   * lookahead, lookbehind and boundary matching operations.
516   * By default, the bounds are opaque.
517   *
518   * @return true if the bounds of the matching region are
519   *         transparent.
520   * @see #useTransparentBounds(boolean)
521   * @see #region(int,int)
522   * @see #regionStart()
523   * @see #regionEnd()
524   * @since 1.5
525   */
526  public boolean hasTransparentBounds()
527  {
528    return transparentBounds;
529  }
530
531  /**
532   * Sets the transparency of the bounds of the region
533   * marked by {@link #regionStart()} and {@link #regionEnd()}.
534   * A value of {@code true} makes the bounds transparent,
535   * so the matcher can see beyond them to perform lookahead,
536   * lookbehind and boundary matching operations.  A value
537   * of {@code false} (the default) makes the bounds opaque,
538   * restricting the match to the input region denoted
539   * by {@link #regionStart()} and {@link #regionEnd()}.
540   *
541   * @param transparent true if the bounds should be transparent.
542   * @return a reference to this matcher.
543   * @see #hasTransparentBounds()
544   * @see #region(int,int)
545   * @see #regionStart()
546   * @see #regionEnd()
547   * @since 1.5
548   */
549  public Matcher useTransparentBounds(boolean transparent)
550  {
551    transparentBounds = transparent;
552    return this;
553  }
554
555  /**
556   * Returns true if the matcher will honour the use of
557   * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
558   * {@code \z} and {@code $}.  By default, the anchors
559   * are used.  Note that the effect of the anchors is
560   * also affected by {@link #hasTransparentBounds()}.
561   *
562   * @return true if the matcher will attempt to match
563   *         the anchoring bounds.
564   * @see #useAnchoringBounds(boolean)
565   * @see #hasTransparentBounds()
566   * @since 1.5
567   */
568  public boolean hasAnchoringBounds()
569  {
570    return anchoringBounds == 0;
571  }
572
573  /**
574   * Enables or disables the use of the anchoring bounds:
575   * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
576   * {@code $}. By default, their use is enabled.  When
577   * disabled, the matcher will not attempt to match
578   * the anchors.
579   *
580   * @param useAnchors true if anchoring bounds should be used.
581   * @return a reference to this matcher.
582   * @since 1.5
583   * @see #hasAnchoringBounds()
584   */
585  public Matcher useAnchoringBounds(boolean useAnchors)
586  {
587    if (useAnchors)
588      anchoringBounds = 0;
589    else
590      anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
591    return this;
592  }
593
594  /**
595   * Returns a read-only snapshot of the current state of
596   * the {@link Matcher} as a {@link MatchResult}.  Any
597   * subsequent changes to this instance are not reflected
598   * in the returned {@link MatchResult}.
599   *
600   * @return a {@link MatchResult} instance representing the
601   *         current state of the {@link Matcher}.
602   */
603  public MatchResult toMatchResult()
604  {
605    Matcher snapshot = new Matcher(pattern, input);
606    snapshot.match = (REMatch) match.clone();
607    return snapshot;
608  }
609
610}