// Source listing from the jing-trang package.
//
// Translator.java

package com.thaiopensource.datatype.xsd.regex.java;

import com.thaiopensource.datatype.xsd.regex.RegexSyntaxException;
import com.thaiopensource.util.Localizer;
import com.thaiopensource.util.Utf16;

import java.math.BigDecimal;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

/**
 * Translates XML Schema regexes into <code>java.util.regex</code> regexes.
 *
 * @see java.util.regex.Pattern
 * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
 */
public class Translator {
  // The XML Schema regular expression being translated.
  private final String regExp;
  // Index into regExp of the next character that advance() will read.
  private int pos = 0;
  // Cached regExp.length().
  private final int length;
  // The character currently being examined; set to EOS once the input is exhausted.
  private char curChar;
  // Set to true by advance() when it is called with no input left.
  private boolean eos = false;
  // Accumulates the translated java.util.regex expression.
  private final StringBuffer result = new StringBuffer();
  // Chooses between outputDirect and outputMungeSurrogates in CharClass.output.
  static private final boolean surrogatesDirect = RegexFeatures.SURROGATES_DIRECT;

  // One-letter category names; the index of a letter is its slot in categoryCharClasses.
  static private final String categories = "LMNPZSC";
  // Cache of category classes, filled on demand by getCategoryCharClass.
  static private final CharClass[] categoryCharClasses = new CharClass[categories.length()];
  // Two-letter subcategory names concatenated; name k occupies characters 2k and 2k+1.
  static private final String subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
  // Cache of subcategory classes, filled on demand by getSubCategoryCharClass.
  static private final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2];

  // First and last code points outside the Basic Multilingual Plane.
  static private final int NONBMP_MIN = 0x10000;
  static private final int NONBMP_MAX = 0x10FFFF;
  // First and last low (second) surrogate code units.
  static private final char SURROGATE2_MIN = '\uDC00';
  static private final char SURROGATE2_MAX = '\uDFFF';

  // Source of the messages used by makeException.
  static final Localizer localizer = new Localizer(Translator.class);

  /**
   * Block names accepted after the <code>Is</code> prefix of a <code>\p{Is...}</code>
   * property and translated by parseProp to a Java <code>In...</code> property.
   * Blocks that need explicit code point ranges are listed in specialBlockNames instead.
   */
  static private final String[] blockNames = {
    "BasicLatin",
    "Latin-1Supplement",
    "LatinExtended-A",
    "LatinExtended-B",
    "IPAExtensions",
    "SpacingModifierLetters",
    "CombiningDiacriticalMarks",
    "Greek",
    "Cyrillic",
    "Armenian",
    "Hebrew",
    "Arabic",
    "Syriac",
    "Thaana",
    "Devanagari",
    "Bengali",
    "Gurmukhi",
    "Gujarati",
    "Oriya",
    "Tamil",
    "Telugu",
    "Kannada",
    "Malayalam",
    "Sinhala",
    "Thai",
    "Lao",
    "Tibetan",
    "Myanmar",
    "Georgian",
    "HangulJamo",
    "Ethiopic",
    "Cherokee",
    "UnifiedCanadianAboriginalSyllabics",
    "Ogham",
    "Runic",
    "Khmer",
    "Mongolian",
    "LatinExtendedAdditional",
    "GreekExtended",
    "GeneralPunctuation",
    "SuperscriptsandSubscripts",
    "CurrencySymbols",
    "CombiningMarksforSymbols",
    "LetterlikeSymbols",
    "NumberForms",
    "Arrows",
    "MathematicalOperators",
    "MiscellaneousTechnical",
    "ControlPictures",
    "OpticalCharacterRecognition",
    "EnclosedAlphanumerics",
    "BoxDrawing",
    "BlockElements",
    "GeometricShapes",
    "MiscellaneousSymbols",
    "Dingbats",
    "BraillePatterns",
    "CJKRadicalsSupplement",
    "KangxiRadicals",
    "IdeographicDescriptionCharacters",
    "CJKSymbolsandPunctuation",
    "Hiragana",
    "Katakana",
    "Bopomofo",
    "HangulCompatibilityJamo",
    "Kanbun",
    "BopomofoExtended",
    "EnclosedCJKLettersandMonths",
    "CJKCompatibility",
    "CJKUnifiedIdeographsExtensionA",
    "CJKUnifiedIdeographs",
    "YiSyllables",
    "YiRadicals",
    "HangulSyllables",
    // surrogates excluded because there are never any *characters* with codes in surrogate range
    // "PrivateUse", excluded because 3.1 adds non-BMP ranges
    "CJKCompatibilityIdeographs",
    "AlphabeticPresentationForms",
    "ArabicPresentationForms-A",
    "CombiningHalfMarks",
    "CJKCompatibilityForms",
    "SmallFormVariants",
    "ArabicPresentationForms-B",
    "Specials",
    "HalfwidthandFullwidthForms",
    // NOTE(review): "Specials" is listed twice; harmless for the linear lookup in isBlock
    "Specials"
  };


  /**
   * Names of blocks including ranges outside the BMP.
   */
00133   static private final String[] specialBlockNames = {
    "OldItalic",
    "Gothic",
    "Deseret",
    "ByzantineMusicalSymbols",
    "MusicalSymbols",
    "MathematicalAlphanumericSymbols",
    "CJKUnifiedIdeographsExtensionB",
    "CJKCompatibilityIdeographsSupplement",
    "Tags",
    "PrivateUse"
  };

  /**
   * CharClass for each block name in specialBlockNames.
   */
00149   static private final CharClass[] specialBlockCharClasses = {
    new CharRange(0x10300, 0x1032F),
    new CharRange(0x10330, 0x1034F),
    new CharRange(0x10400, 0x1044F),
    new CharRange(0x1D000, 0x1D0FF),
    new CharRange(0x1D100, 0x1D1FF),
    new CharRange(0x1D400, 0x1D7FF),
    new CharRange(0x20000, 0x2A6D6),
    new CharRange(0x2F800, 0x2FA1F),
    new CharRange(0xE0000, 0xE007F),
    new Union(new CharClass[] {
      new CharRange(0xE000, 0xF8FF),
      new CharRange(0xF0000, 0xFFFFD),
      new CharRange(0x100000, 0x10FFFD)
    })
  };

  // Class used for '.': everything except line feed and carriage return.
  static private final CharClass DOT = new Complement(new Union(new CharClass[] { new SingleChar('\n'), new SingleChar('\r') }));

  // \d: the Nd property.
  static private final CharClass ESC_d = new Property("Nd");

  // \D: complement of \d.
  static private final CharClass ESC_D = new Complement(ESC_d);

  // \W: union of the P, Z and C properties.
  static private final CharClass ESC_W = new Union(new CharClass[] {new Property("P"), new Property("Z"), new Property("C")});

  // \w: complement of \W.
  static private final CharClass ESC_w = new Complement(ESC_W);

  // \s: space, line feed, carriage return and tab.
  static private final CharClass ESC_s = new Union(new CharClass[] {
    new SingleChar(' '),
    new SingleChar('\n'),
    new SingleChar('\r'),
    new SingleChar('\t')
  });

  // \S: complement of \s.
  static private final CharClass ESC_S = new Complement(ESC_s);

  // \i: built from the NMSTRT tables in NamingExceptions.
  static private final CharClass ESC_i = makeCharClass(NamingExceptions.NMSTRT_CATEGORIES,
                                                       NamingExceptions.NMSTRT_INCLUDES,
                                                       NamingExceptions.NMSTRT_EXCLUDE_RANGES);

  // \I: complement of \i.
  static private final CharClass ESC_I = new Complement(ESC_i);

  // \c: built from the NMCHAR tables in NamingExceptions.
  static private final CharClass ESC_c = makeCharClass(NamingExceptions.NMCHAR_CATEGORIES,
                                                       NamingExceptions.NMCHAR_INCLUDES,
                                                       NamingExceptions.NMCHAR_EXCLUDE_RANGES);

  // \C: complement of \c.
  static private final CharClass ESC_C = new Complement(ESC_c);

  // Value stored in curChar by advance() when the input is exhausted; the eos flag
  // distinguishes this from a literal NUL character in the input.
  static private final char EOS = '\0';

  /**
   * Creates a translator positioned on the first character of <code>regExp</code>.
   */
  private Translator(String regExp) {
    length = regExp.length();
    this.regExp = regExp;
    // load the first character (or EOS for an empty expression) into curChar
    advance();
  }

  /**
   * Translates a regular expression in the syntax of XML Schemas Part 2 into a regular
   * expression in the syntax of <code>java.util.regex.Pattern</code>.  The translation
   * assumes that the string to be matched against the regex uses surrogate pairs correctly.
   * If the string comes from XML content, a conforming XML parser will automatically
   * check this; if the string comes from elsewhere, it may be necessary to check
   * surrogate usage before matching.
   *
   * @param regexp a String containing a regular expression in the syntax of XML Schemas Part 2
   * @return a String containing a regular expression in the syntax of java.util.regex.Pattern
   * @throws RegexSyntaxException if <code>regexp</code> is not a regular expression in the
   * syntax of XML Schemas Part 2
   * @see java.util.regex.Pattern
   * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
   */
00220   static public String translate(String regexp) throws RegexSyntaxException {
    Translator tr = new Translator(regexp);
    tr.translateTop();
    return tr.result.toString();
  }

  /**
   * Moves to the next input character.  When no input is left, <code>curChar</code>
   * becomes <code>EOS</code> and <code>eos</code> is set; <code>pos</code> is still
   * incremented so that <code>pos - 1</code> keeps identifying the current position.
   */
  private void advance() {
    if (pos >= length) {
      pos++;
      curChar = EOS;
      eos = true;
      return;
    }
    curChar = regExp.charAt(pos++);
  }

  /**
   * Translates the whole expression and rejects any trailing input that
   * translateRegExp could not consume.
   */
  private void translateTop() throws RegexSyntaxException {
    translateRegExp();
    if (eos)
      return;
    throw makeException("expected_eos");
  }

  /**
   * Translates one or more branches separated by <code>|</code>, copying each
   * separator to the output unchanged.
   */
  private void translateRegExp() throws RegexSyntaxException {
    for (;;) {
      translateBranch();
      if (curChar != '|')
        break;
      copyCurChar();
    }
  }

  /**
   * Translates a possibly empty sequence of atoms, each followed by an optional quantifier.
   */
  private void translateBranch() throws RegexSyntaxException {
    for (;;) {
      if (!translateAtom())
        return;
      translateQuantifier();
    }
  }

  /**
   * Translates the quantifier at the current position, if there is one.
   * <code>*</code>, <code>?</code> and <code>+</code> are copied as they are;
   * a <code>{...}</code> quantity is handed to translateQuantity.
   */
  private void translateQuantifier() throws RegexSyntaxException {
    if (curChar == '*' || curChar == '?' || curChar == '+') {
      copyCurChar();
      return;
    }
    if (curChar != '{')
      return;
    copyCurChar();
    translateQuantity();
    expect('}');
    copyCurChar();
  }

  /**
   * Translates the inside of a <code>{n}</code>, <code>{n,}</code> or <code>{n,m}</code>
   * quantity.  Bounds too large for an <code>int</code> are written as
   * <code>Integer.MAX_VALUE</code>.
   *
   * @throws RegexSyntaxException if a bound is not a digit string or the upper bound
   * is smaller than the lower bound
   */
  private void translateQuantity() throws RegexSyntaxException {
    final String lower = parseQuantExact();
    // -1 marks a lower bound that does not fit in an int
    int lowerValue = -1;
    try {
      lowerValue = Integer.parseInt(lower);
      result.append(lower);
    }
    catch (NumberFormatException tooBig) {
      // JDK 1.4 cannot handle ranges bigger than this
      result.append(Integer.MAX_VALUE);
    }
    if (curChar != ',')
      return;
    copyCurChar();
    if (curChar == '}')
      return;
    final String upper = parseQuantExact();
    int upperValue;
    try {
      upperValue = Integer.parseInt(upper);
    }
    catch (NumberFormatException tooBig) {
      result.append(Integer.MAX_VALUE);
      // both bounds overflowed, so compare them with arbitrary precision
      if (lowerValue < 0 && new BigDecimal(lower).compareTo(new BigDecimal(upper)) > 0)
        throw makeException("invalid_quantity_range");
      return;
    }
    result.append(upper);
    // an overflowed lower bound is necessarily bigger than an upper bound that fits
    if (lowerValue < 0 || upperValue < lowerValue)
      throw makeException("invalid_quantity_range");
  }

  /**
   * Reads one or more decimal digits, stopping in front of the next <code>,</code>
   * or <code>}</code>.
   *
   * @return the digits that were read
   * @throws RegexSyntaxException if a non-digit is found before the terminator
   */
  private String parseQuantExact() throws RegexSyntaxException {
    StringBuffer digits = new StringBuffer();
    for (;;) {
      if (curChar < '0' || curChar > '9')
        throw makeException("expected_digit");
      digits.append(curChar);
      advance();
      if (curChar == ',' || curChar == '}')
        return digits.toString();
    }
  }

  /**
   * Appends the current character to the output unchanged and moves to the next one.
   */
  private void copyCurChar() {
    result.append(curChar);
    advance();
  }

  // How much of a range of code points a CharClass contains.  The values are ordered
  // and symmetric around SOME, so that negation gives the complement, Math.min the
  // intersection and Math.max the union (see Complement, Subtraction and Union).
  static final int NONE = -1;
  static final int SOME = 0;
  static final int ALL = 1;

  // Bracket expression matching every high (first) surrogate.
  static final String SURROGATES1_CLASS = "[\uD800-\uDBFF]";
  // Bracket expression matching every low (second) surrogate.
  static final String SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
  // Emitted by outputMungeSurrogates for a class that contains nothing.
  static final String NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";

  /**
   * An inclusive range of code points.  Ranges are ordered by increasing lower
   * bound; among ranges with the same lower bound the one with the larger
   * upper bound comes first.
   */
  static final class Range implements Comparable {
    private final int min;
    private final int max;

    Range(int min, int max) {
      this.min = min;
      this.max = max;
    }

    int getMin() {
      return min;
    }

    int getMax() {
      return max;
    }

    public int compareTo(Object o) {
      Range that = (Range)o;
      if (min != that.min)
        return min < that.min ? -1 : 1;
      if (max != that.max)
        // wider range sorts first
        return max > that.max ? -1 : 1;
      return 0;
    }
  }

  /**
   * A set of characters that can write itself out as part of a java.util.regex
   * expression.  Each instance records how much of the BMP and how much of the
   * non-BMP code points it contains (NONE, SOME or ALL).
   */
  static abstract class CharClass  {

    // NONE, SOME or ALL for code points below NONBMP_MIN
    private final int containsBmp;
    // if it contains ALL and containsBmp != NONE, then the generated class for containsBmp must
    // contain all the high surrogates
    private final int containsNonBmp;

    protected CharClass(int containsBmp, int containsNonBmp) {
      this.containsBmp = containsBmp;
      this.containsNonBmp = containsNonBmp;
    }

    int getContainsBmp() {
      return containsBmp;
    }

    int getContainsNonBmp() {
      return containsNonBmp;
    }

    /**
     * Appends the regex for this class to <code>buf</code>, using the direct form
     * when <code>surrogatesDirect</code> is set and the surrogate-pair form otherwise.
     */
    final void output(StringBuffer buf) {
      if (surrogatesDirect)
        outputDirect(buf);
      else
        outputMungeSurrogates(buf);
    }

    /**
     * Appends a regex in which non-BMP members are spelled out as explicit
     * high-surrogate/low-surrogate sequences.
     */
    final void outputMungeSurrogates(StringBuffer buf) {
      switch (containsNonBmp) {
      case NONE:
        // BMP only: either nothing at all or the plain direct form
        if (containsBmp == NONE)
          buf.append(NOT_ALLOWED_CLASS);
        else
          outputDirect(buf);
        break;
      case ALL:
        buf.append('(');
        if (containsBmp == NONE) {
          // any high surrogate followed by any low surrogate
          buf.append(SURROGATES1_CLASS);
          buf.append(SURROGATES2_CLASS);
        }
        else {
          // the BMP class followed by an optional low surrogate
          outputDirect(buf);
          buf.append(SURROGATES2_CLASS);
          buf.append('?');
        }
        buf.append(')');
        break;
      case SOME:
        // alternation of: the BMP part, fully covered high surrogates, and
        // partially covered high surrogates with their low surrogate ranges
        buf.append('(');
        boolean needSep = false;
        if (containsBmp != NONE) {
          needSep = true;
          outputDirect(buf);
        }
        List ranges = new Vector();
        addNonBmpRanges(ranges);
        sortRangeList(ranges);
        // hi holds (min, max) pairs of high surrogates
        String hi = highSurrogateRanges(ranges);
        if (hi.length() > 0) {
          if (needSep)
            buf.append('|');
          else
            needSep = true;
          buf.append('[');
          for (int i = 0, len = hi.length(); i < len; i += 2) {
            char min = hi.charAt(i);
            char max = hi.charAt(i + 1);
            if (min == max)
              buf.append(min);
            else {
              buf.append(min);
              buf.append('-');
              buf.append(max);
            }
          }
          buf.append(']');
          buf.append(SURROGATES2_CLASS);
        }
        // lo holds (high surrogate, low min, low max) triples
        String lo = lowSurrogateRanges(ranges);
        for (int i = 0, len = lo.length(); i < len; i += 3) {
          if (needSep)
            buf.append('|');
          else
            needSep = true;
          buf.append(lo.charAt(i));
          char min = lo.charAt(i + 1);
          char max = lo.charAt(i + 2);
          // a single low surrogate with no further triple for the same high
          // surrogate needs no bracket expression
          if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)))
            buf.append(min);
          else {
            buf.append('[');
            for (;;) {
              if (min == max)
                buf.append(min);
              else {
                buf.append(min);
                buf.append('-');
                buf.append(max);
              }
              // keep consuming triples that share the same high surrogate
              if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))
                break;
              i += 3;
              min = lo.charAt(i + 1);
              max = lo.charAt(i + 2);
            }
            buf.append(']');
          }
        }
        if (!needSep)
          // nothing was written, so the class is empty
          buf.append(NOT_ALLOWED_CLASS);
        buf.append(')');
        break;
      }
    }

    /**
     * Returns the high surrogates whose low surrogates are all covered by
     * <code>ranges</code>, encoded as consecutive (min, max) character pairs.
     */
    static String highSurrogateRanges(List ranges) {
      StringBuffer highRanges = new StringBuffer();
      for (int i = 0, len = ranges.size(); i < len; i++) {
        Range r = (Range)ranges.get(i);
        char min1 = Utf16.surrogate1(r.getMin());
        char min2 = Utf16.surrogate2(r.getMin());
        char max1 = Utf16.surrogate1(r.getMax());
        char max2 = Utf16.surrogate2(r.getMax());
        // a high surrogate at either end is only partly covered unless the range
        // starts at the first / ends at the last low surrogate
        if (min2 != SURROGATE2_MIN)
          min1++;
        if (max2 != SURROGATE2_MAX)
          max1--;
        if (max1 >= min1) {
          highRanges.append(min1);
          highRanges.append(max1);
        }
      }
      return highRanges.toString();
    }

    /**
     * Returns the partly covered high surrogates of <code>ranges</code>, encoded as
     * consecutive (high surrogate, low min, low max) character triples.
     */
    static String lowSurrogateRanges(List ranges) {
      StringBuffer lowRanges = new StringBuffer();
      for (int i = 0, len = ranges.size(); i < len; i++) {
        Range r = (Range)ranges.get(i);
        char min1 = Utf16.surrogate1(r.getMin());
        char min2 = Utf16.surrogate2(r.getMin());
        char max1 = Utf16.surrogate1(r.getMax());
        char max2 = Utf16.surrogate2(r.getMax());
        if (min1 == max1) {
          // whole range lies under one high surrogate
          if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) {
            lowRanges.append(min1);
            lowRanges.append(min2);
            lowRanges.append(max2);
          }
        }
        else {
          // partial coverage is only possible at the first and last high surrogate
          if (min2 != SURROGATE2_MIN) {
            lowRanges.append(min1);
            lowRanges.append(min2);
            lowRanges.append(SURROGATE2_MAX);
          }
          if (max2 != SURROGATE2_MAX) {
            lowRanges.append(max1);
            lowRanges.append(SURROGATE2_MIN);
            lowRanges.append(max2);
          }
        }
      }
      return lowRanges.toString();
    }

    // Appends the regex for this class, without any surrogate munging.
    abstract void outputDirect(StringBuffer buf);
    // Appends the regex for the complement of this class, without any surrogate munging.
    abstract void outputComplementDirect(StringBuffer buf);

    // Returns the single code point this class consists of, or -1 if it is not a single character.
    int singleChar() {
      return -1;
    }

    // Adds the non-BMP ranges of this class to the list; the default has none.
    void addNonBmpRanges(List ranges) {
    }


    /**
     * Sorts <code>ranges</code> and merges overlapping or adjacent ranges, in place.
     */
    static void sortRangeList(List ranges) {
      Collections.sort(ranges);
      int toIndex = 0;
      int fromIndex = 0;
      int len = ranges.size();
      while (fromIndex < len) {
        Range r = (Range)ranges.get(fromIndex);
        int min = r.getMin();
        int max = r.getMax();
        // absorb every following range that touches or overlaps [min, max]
        while (++fromIndex < len) {
          Range r2 = (Range)ranges.get(fromIndex);
          if (r2.getMin() > max + 1)
            break;
          if (r2.getMax() > max)
            max = r2.getMax();
        }
        if (max != r.getMax())
          r = new Range(min, max);
        ranges.set(toIndex++, r);
      }
      // drop the slots left over after merging
      while (len > toIndex)
        ranges.remove(--len);
    }

  }

  /**
   * A class that can be written as the contents of a single bracket expression.
   */
  static abstract class SimpleCharClass extends CharClass {
    SimpleCharClass(int containsBmp, int containsNonBmp) {
      super(containsBmp, containsNonBmp);
    }

    // Appends this class in the form used inside a bracket expression.
    abstract void inClassOutputDirect(StringBuffer buf);

    void outputDirect(StringBuffer buf) {
      buf.append('[');
      inClassOutputDirect(buf);
      buf.append(']');
    }

    // must not call if containsBmp == ALL && !surrogatesDirect
    void outputComplementDirect(StringBuffer buf) {
      if (surrogatesDirect || getContainsBmp() != NONE) {
        buf.append("[^");
        inClassOutputDirect(buf);
        buf.append(']');
        return;
      }
      // nothing from the BMP is in this class, so its BMP complement is every code unit
      buf.append("[\u0000-\uFFFF]");
    }

    // Appends code point c as a high surrogate followed by a low surrogate.
    static void outputWide(StringBuffer buf, int c) {
      buf.append(Utf16.surrogate1(c)).append(Utf16.surrogate2(c));
    }
  }

  /**
   * A class consisting of exactly one BMP character.
   */
  static class SingleChar extends SimpleCharClass {
    private final char ch;

    SingleChar(char c) {
      super(SOME, NONE);
      ch = c;
    }

    int singleChar() {
      return ch;
    }

    // a single character needs no surrounding brackets
    void outputDirect(StringBuffer buf) {
      inClassOutputDirect(buf);
    }

    void inClassOutputDirect(StringBuffer buf) {
      boolean needsEscape = isJavaMetaChar(ch);
      if (needsEscape)
        buf.append('\\');
      buf.append(ch);
    }

  }

  /**
   * A class consisting of exactly one non-BMP code point.
   */
  static class WideSingleChar extends SimpleCharClass {
    private final int codePoint;

    WideSingleChar(int c) {
      super(NONE, SOME);
      codePoint = c;
    }

    int singleChar() {
      return codePoint;
    }

    void addNonBmpRanges(List ranges) {
      ranges.add(new Range(codePoint, codePoint));
    }

    // only usable when surrogate pairs may be written directly
    void inClassOutputDirect(StringBuffer buf) {
      if (surrogatesDirect) {
        outputWide(buf, codePoint);
        return;
      }
      throw new RuntimeException("BMP output botch");
    }
  }

  /**
   * An inclusive range of code points, which may lie in the BMP, outside it, or span both.
   */
  static class CharRange extends SimpleCharClass {
    private final int lower;
    private final int upper;

    CharRange(int lower, int upper) {
      super(lower >= NONBMP_MIN ? NONE : SOME,
            // don't use ALL here, because that requires that the BMP class contains high surrogates
            upper < NONBMP_MIN ? NONE : SOME);
      this.lower = lower;
      this.upper = upper;
    }

    void inClassOutputDirect(StringBuffer buf) {
      // lower bound
      if (lower >= NONBMP_MIN) {
        if (!surrogatesDirect)
          throw new RuntimeException("BMP output botch");
        outputWide(buf, lower);
      }
      else {
        char first = (char)lower;
        if (isJavaMetaChar(first))
          buf.append('\\');
        buf.append(first);
      }
      buf.append('-');
      // upper bound; without direct surrogates the BMP part is capped at the last code unit
      if (upper >= NONBMP_MIN) {
        if (surrogatesDirect)
          outputWide(buf, upper);
        else
          buf.append('\uFFFF');
      }
      else {
        char last = (char)upper;
        if (isJavaMetaChar(last))
          buf.append('\\');
        buf.append(last);
      }
    }

    void addNonBmpRanges(List ranges) {
      if (upper < NONBMP_MIN)
        return;
      // clip the range to its non-BMP part
      ranges.add(new Range(Math.max(lower, NONBMP_MIN), upper));
    }
  }

  /**
   * A class written as a named <code>\p{...}</code> property, with
   * <code>\P{...}</code> as its complement.
   */
  static class Property extends SimpleCharClass {
    private final String name;

    Property(String name) {
      super(SOME, NONE);
      this.name = name;
    }

    // the property escape needs no surrounding brackets
    void outputDirect(StringBuffer buf) {
      inClassOutputDirect(buf);
    }

    void inClassOutputDirect(StringBuffer buf) {
      buf.append("\\p{").append(name).append('}');
    }

    void outputComplementDirect(StringBuffer buf) {
      buf.append("\\P{").append(name).append('}');
    }
  }

  /**
   * The characters of <code>cc1</code> that are not in <code>cc2</code>.
   */
  static class Subtraction extends CharClass {
    private final CharClass cc1;
    private final CharClass cc2;
    Subtraction(CharClass cc1, CharClass cc2) {
      // min corresponds to intersection
      // complement corresponds to negation
      super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
            Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
      this.cc1 = cc1;
      this.cc2 = cc2;
    }

    // Written as the intersection of cc1 with the complement of cc2.
    void outputDirect(StringBuffer buf) {
      buf.append('[');
      cc1.outputDirect(buf);
      buf.append("&&");
      cc2.outputComplementDirect(buf);
      buf.append(']');
    }

    // Written as the complement of cc1 followed by cc2 inside one bracket expression.
    void outputComplementDirect(StringBuffer buf) {
      buf.append('[');
      cc1.outputComplementDirect(buf);
      cc2.outputDirect(buf);
      buf.append(']');
    }

    /**
     * Adds the non-BMP ranges of cc1 with the non-BMP ranges of cc2 cut out of them.
     * Both lists are sorted and merged first, then walked in step.
     */
    void addNonBmpRanges(List ranges) {
      List posList = new Vector();
      cc1.addNonBmpRanges(posList);
      List negList = new Vector();
      cc2.addNonBmpRanges(negList);
      sortRangeList(posList);
      sortRangeList(negList);
      Iterator negIter = negList.iterator();
      // current range being subtracted, or null when none are left
      Range negRange;
      if (negIter.hasNext())
        negRange = (Range)negIter.next();
      else
        negRange = null;
      for (int i = 0, len = posList.size(); i < len; i++) {
        Range posRange = (Range)posList.get(i);
        // skip subtracted ranges that end before this range starts
        while (negRange != null && negRange.getMax() < posRange.getMin()) {
          if (negIter.hasNext())
            negRange = (Range)negIter.next();
          else
            negRange = null;
        }
        // if negRange != null, negRange.max >= posRange.min
        // min is the start of the part of posRange not yet emitted or removed
        int min = posRange.getMin();
        while (negRange != null && negRange.getMin() <= posRange.getMax()) {
          if (min < negRange.getMin()) {
            ranges.add(new Range(min, negRange.getMin() - 1));
          }
          min = negRange.getMax() + 1;
          // negRange reaches past posRange, so keep it for the next posRange
          if (min > posRange.getMax())
            break;
          if (negIter.hasNext())
            negRange = (Range)negIter.next();
          else
            negRange = null;
        }
        // whatever is left after the last subtracted range
        if (min <= posRange.getMax())
          ranges.add(new Range(min, posRange.getMax()));
      }
    }
  }

  /**
   * The union of a list of member classes.
   */
  static class Union extends CharClass {
    // the member CharClass objects
    private final List members;

    Union(CharClass[] v) {
      this(toList(v));
    }

    // Copies the array into a new list.
    static private List toList(CharClass[] v) {
      List members = new Vector();
      for (int i = 0; i < v.length; i++)
        members.add(v[i]);
      return members;
    }

    // Keeps a reference to the given list; it is not copied.
    Union(List members) {
      super(computeContainsBmp(members), computeContainsNonBmp(members));
      this.members = members;
    }

    /**
     * Writes one bracket expression holding every member.  Simple members are
     * written in their in-class form, other members as nested expressions.  When
     * surrogates are not written directly, members with no BMP content are skipped.
     */
    void outputDirect(StringBuffer buf) {
      buf.append('[');
      for (int i = 0, len = members.size(); i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if (surrogatesDirect || cc.getContainsBmp() != NONE) {
          if (cc instanceof SimpleCharClass)
            ((SimpleCharClass)cc).inClassOutputDirect(buf);
          else
            cc.outputDirect(buf);
        }
      }
      buf.append(']');
    }

    /**
     * Writes the complement in two passes: first all simple members gathered into one
     * negated bracket expression, then the complement of each remaining member,
     * joined with <code>&amp;&amp;</code>.
     */
    void outputComplementDirect(StringBuffer buf) {
      // true until something has been written
      boolean first = true;
      int len = members.size();
      for (int i = 0; i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if ((surrogatesDirect || cc.getContainsBmp() != NONE) && cc instanceof SimpleCharClass) {
          if (first) {
            buf.append("[^");
            first = false;
          }
          ((SimpleCharClass)cc).inClassOutputDirect(buf);
        }
      }
      for (int i = 0; i < len; i++) {
        CharClass cc = (CharClass)members.get(i);
        if ((surrogatesDirect || cc.getContainsBmp() != NONE) && !(cc instanceof SimpleCharClass)) {
          if (first) {
            buf.append('[');
            first = false;
          }
          else
            buf.append("&&");
          // can't have any members that are ALL, because that would make this ALL, which violates
          // the precondition for outputComplementDirect
          cc.outputComplementDirect(buf);
        }
      }
      if (first)
        // all members are NONE, so this is NONE, so complement is everything
        buf.append("[\u0000-\uFFFF]");
      else
        buf.append(']');
    }

    // Collects the non-BMP ranges of every member; the result is neither sorted nor merged.
    void addNonBmpRanges(List ranges) {
      for (int i = 0, len = members.size(); i < len; i++)
        ((CharClass)members.get(i)).addNonBmpRanges(ranges);
    }

    // Largest BMP containment value among the members (NONE for an empty list).
    private static int computeContainsBmp(List members) {
      int ret = NONE;
      for (int i = 0, len = members.size(); i < len; i++)
        ret = Math.max(ret, ((CharClass)members.get(i)).getContainsBmp());
      return ret;
    }

    // Largest non-BMP containment value among the members (NONE for an empty list).
    private static int computeContainsNonBmp(List members) {
      int ret = NONE;
      for (int i = 0, len = members.size(); i < len; i++)
        ret = Math.max(ret, ((CharClass)members.get(i)).getContainsNonBmp());
      return ret;
    }
  }

  /**
   * Every character that is not in the wrapped class.
   */
  static class Complement extends CharClass {
    private final CharClass cc;

    Complement(CharClass cc) {
      // negating NONE/SOME/ALL gives the containment of the complement
      super(-cc.getContainsBmp(), -cc.getContainsNonBmp());
      this.cc = cc;
    }

    void outputDirect(StringBuffer buf) {
      cc.outputComplementDirect(buf);
    }

    void outputComplementDirect(StringBuffer buf) {
      cc.outputDirect(buf);
    }

    // Adds the gaps between the wrapped class's non-BMP ranges.
    void addNonBmpRanges(List ranges) {
      List excluded = new Vector();
      cc.addNonBmpRanges(excluded);
      sortRangeList(excluded);
      // first code point not yet known to be excluded
      int next = NONBMP_MIN;
      for (Iterator it = excluded.iterator(); it.hasNext();) {
        Range r = (Range)it.next();
        if (next < r.getMin())
          ranges.add(new Range(next, r.getMin() - 1));
        next = r.getMax() + 1;
      }
      if (next != NONBMP_MAX + 1)
        ranges.add(new Range(next, NONBMP_MAX));
    }
  }

  /**
   * Translates one atom at the current position, if there is one.
   *
   * @return false if the current character cannot start an atom (end of input, a
   * quantifier character, <code>)</code>, <code>}</code>, <code>|</code> or
   * <code>]</code>), in which case nothing is consumed; true after an atom has
   * been translated
   * @throws RegexSyntaxException if the atom is malformed
   */
  private boolean translateAtom() throws RegexSyntaxException {
    switch (curChar) {
    case EOS:
      // a NUL that is part of the input is an ordinary character
      if (!eos)
        break;
      // fall through
    case '?':
    case '*':
    case '+':
    case ')':
    case '{':
    case '}':
    case '|':
    case ']':
      return false;
    case '(':
      // parenthesized sub-expression; the parentheses are copied as they are
      copyCurChar();
      translateRegExp();
      expect(')');
      copyCurChar();
      return true;
    case '\\':
      advance();
      parseEsc().output(result);
      return true;
    case '[':
      advance();
      parseCharClassExpr().output(result);
      return true;
    case '.':
      DOT.output(result);
      advance();
      return true;
    case '$':
    case '^':
      // written with a backslash in front, then copied below like any other character
      result.append('\\');
      break;
    }
    copyCurChar();
    return true;
  }


  /**
   * Builds the class (categories plus includes) minus excludeRanges.
   *
   * @param categories two-letter property names concatenated into one string
   * @param includes individual characters to add; runs of consecutive code units are
   * turned into ranges
   * @param excludeRanges pairs of characters, each pair being the first and last
   * character of a range to remove
   */
  static private CharClass makeCharClass(String categories, String includes, String excludeRanges) {
    List includeList = new Vector();
    for (int i = 0, len = categories.length(); i < len; i += 2)
      includeList.add(new Property(categories.substring(i, i + 2)));
    for (int i = 0, len = includes.length(); i < len; i++) {
      // advance j past the run of consecutive characters starting at i
      int j = i + 1;
      for (; j < len && includes.charAt(j) - includes.charAt(i) == j - i; j++)
        ;
      // step back to the last character of the run
      --j;
      // a run of exactly two is written as two single characters, so only
      // the first is taken here and the second on the next iteration
      if (i == j - 1)
        --j;
      if (i == j)
        includeList.add(new SingleChar(includes.charAt(i)));
      else
        includeList.add(new CharRange(includes.charAt(i), includes.charAt(j)));
      i = j;
    }
    List excludeList = new Vector();
    for (int i = 0, len = excludeRanges.length(); i < len; i += 2) {
      char min = excludeRanges.charAt(i);
      char max = excludeRanges.charAt(i + 1);
      if (min == max)
        excludeList.add(new SingleChar(min));
      else if (min == max - 1) {
        // two adjacent characters are written individually rather than as a range
        excludeList.add(new SingleChar(min));
        excludeList.add(new SingleChar(max));
      }
      else
        excludeList.add(new CharRange(min, max));
    }
    if (surrogatesDirect)
      excludeList.add(new CharRange(NONBMP_MIN, NONBMP_MAX)); // Unicode 4.0 adds some non-BMP letters
    return new Subtraction(new Union(includeList), new Union(excludeList));
  }

  /**
   * Parses the part of an escape that follows the backslash and returns the class
   * it stands for.  On return the escape has been consumed.
   *
   * @throws RegexSyntaxException if the character after the backslash does not
   * form a recognized escape
   */
  private CharClass parseEsc() throws RegexSyntaxException {
    // property escapes carry a {...} argument that has to be parsed after the letter
    if (curChar == 'p') {
      advance();
      return parseProp();
    }
    if (curChar == 'P') {
      advance();
      return new Complement(parseProp());
    }
    CharClass escaped;
    switch (curChar) {
    case 'n':
      escaped = new SingleChar('\n');
      break;
    case 'r':
      escaped = new SingleChar('\r');
      break;
    case 't':
      escaped = new SingleChar('\t');
      break;
    case 's':
      escaped = ESC_s;
      break;
    case 'S':
      escaped = ESC_S;
      break;
    case 'i':
      escaped = ESC_i;
      break;
    case 'I':
      escaped = ESC_I;
      break;
    case 'c':
      escaped = ESC_c;
      break;
    case 'C':
      escaped = ESC_C;
      break;
    case 'd':
      escaped = ESC_d;
      break;
    case 'D':
      escaped = ESC_D;
      break;
    case 'w':
      escaped = ESC_w;
      break;
    case 'W':
      escaped = ESC_W;
      break;
    case '\\':
    case '|':
    case '.':
    case '-':
    case '^':
    case '?':
    case '*':
    case '+':
    case '(':
    case ')':
    case '{':
    case '}':
    case '[':
    case ']':
      // an escaped metacharacter stands for itself
      escaped = new SingleChar(curChar);
      break;
    default:
      throw makeException("bad_escape");
    }
    advance();
    return escaped;
  }

  /**
   * Parses the <code>{Name}</code> argument of a <code>\p</code> or <code>\P</code>
   * escape and returns the class for the named property.  A one-letter name is a
   * category, a two-letter name is a subcategory and a longer name must be
   * <code>Is</code> followed by a block name.
   *
   * @throws RegexSyntaxException if the braces are missing, the name contains a
   * character other than an ASCII letter, digit or hyphen, or the name is not a
   * known category, subcategory or block
   */
  private CharClass parseProp() throws RegexSyntaxException {
    expect('{');
    // pos already points just past the '{'
    int start = pos;
    for (;;) {
      advance();
      if (curChar == '}')
        break;
      // any character that cannot be part of a name is reported as a missing '}'
      if (!isAsciiAlnum(curChar) && curChar != '-')
        expect('}');
    }
    String propertyName = regExp.substring(start, pos - 1);
    advance();
    switch (propertyName.length()) {
    case 0:
      throw makeException("empty_property_name");
    case 2:
      // a match at an odd offset straddles two subcategory names
      int sci = subCategories.indexOf(propertyName);
      if (sci < 0 || sci % 2 == 1)
        // pass the offending name, as the one-letter case does
        throw makeException("bad_category", propertyName);
      return getSubCategoryCharClass(sci / 2);
    case 1:
      int ci = categories.indexOf(propertyName.charAt(0));
      if (ci < 0)
        throw makeException("bad_category", propertyName);
      return getCategoryCharClass(ci);
    default:
      if (!propertyName.startsWith("Is"))
        break;
      String blockName = propertyName.substring(2);
      // blocks with non-BMP ranges have explicit classes
      for (int i = 0; i < specialBlockNames.length; i++)
        if (blockName.equals(specialBlockNames[i]))
          return specialBlockCharClasses[i];
      if (!isBlock(blockName))
        throw makeException("bad_block_name", blockName);
      // remaining blocks map to the Java "In" block property of the same name
      return new Property( "In" + blockName);
    }
    throw makeException("bad_property_name", propertyName);
  }

  /**
   * Tests whether <code>name</code> is one of the entries of <code>blockNames</code>.
   */
  static private boolean isBlock(String name) {
    int i = 0;
    while (i < blockNames.length) {
      if (name.equals(blockNames[i++]))
        return true;
    }
    return false;
  }

  /**
   * Tests whether <code>c</code> is an ASCII letter or an ASCII digit.
   */
  static private boolean isAsciiAlnum(char c) {
    boolean lowerCase = c >= 'a' && c <= 'z';
    boolean upperCase = c >= 'A' && c <= 'Z';
    boolean digit = c >= '0' && c <= '9';
    return lowerCase || upperCase || digit;
  }

  /**
   * Checks that the current character is <code>c</code>, without consuming it.
   *
   * @throws RegexSyntaxException if the current character is anything else
   */
  private void expect(char c) throws RegexSyntaxException {
    if (curChar == c)
      return;
    throw makeException("expected", String.valueOf(c));
  }

  /**
   * Parses a character class expression, starting just after its opening
   * <code>[</code> and consuming its closing <code>]</code>.  Handles a leading
   * <code>^</code>, single characters, escapes, <code>a-b</code> ranges and a
   * trailing <code>-[...]</code> subtraction.
   *
   * @throws RegexSyntaxException if the expression is malformed
   */
  private CharClass parseCharClassExpr() throws RegexSyntaxException {
    boolean compl;
    if (curChar == '^') {
      advance();
      compl = true;
    }
    else
      compl = false;
    List members = new Vector();
    do {
      CharClass lower = parseCharClassEscOrXmlChar();
      members.add(lower);
      if (curChar == '-') {
        advance();
        // "-[" starts a subtraction rather than a range
        if (curChar == '[')
          break;
        CharClass upper = parseCharClassEscOrXmlChar();
        // both ends of a range must be single characters
        if (lower.singleChar() < 0 || upper.singleChar() < 0)
          throw makeException("multi_range");
        if (lower.singleChar() > upper.singleChar())
          throw makeException("invalid_range");
        // replace the lower bound added above with the whole range
        members.set(members.size() - 1,
                    new CharRange(lower.singleChar(), upper.singleChar()));
        // a '-' directly after a range can only start a subtraction
        if (curChar == '-') {
          advance();
          expect('[');
          break;
        }
      }
    } while (curChar != ']');
    CharClass result;
    if (members.size() == 1)
      result = (CharClass)members.get(0);
    else
      result = new Union(members);
    if (compl)
      result = new Complement(result);
    if (curChar == '[') {
      // the nested call consumes the inner ']', leaving the outer one to check here
      advance();
      result = new Subtraction(result, parseCharClassExpr());
      expect(']');
    }
    // consume the closing ']'
    advance();
    return result;
  }

  /**
   * Parses one member of a character class expression: an escape, a BMP character
   * or a surrogate pair.
   *
   * @throws RegexSyntaxException at end of input, on an unquoted <code>[</code>,
   * <code>]</code> or <code>-</code>, or on a surrogate that is not part of a
   * well-formed pair
   */
  private CharClass parseCharClassEscOrXmlChar() throws RegexSyntaxException {
    if (curChar == '\\') {
      advance();
      return parseEsc();
    }
    if (curChar == '[' || curChar == ']' || curChar == '-')
      throw makeException("should_quote", String.valueOf(curChar));
    // running out of input inside a class is reported as a missing ']'
    if (curChar == EOS && eos)
      expect(']');
    CharClass member;
    if (!Utf16.isSurrogate(curChar))
      member = new SingleChar(curChar);
    else {
      if (!Utf16.isSurrogate1(curChar))
        throw makeException("invalid_surrogate");
      char high = curChar;
      advance();
      if (!Utf16.isSurrogate2(curChar))
        throw makeException("invalid_surrogate");
      member = new WideSingleChar(Utf16.scalarValue(high, curChar));
    }
    advance();
    return member;
  }

  /**
   * Builds (but does not throw) a syntax exception for the given message key.
   *
   * @param key the key passed to <code>localizer.message</code>
   * @return an exception carrying the localized message and the position
   *         <code>pos - 1</code>
   */
  private RegexSyntaxException makeException(String key) {
    String message = localizer.message(key);
    int errorPos = pos - 1;
    return new RegexSyntaxException(message, errorPos);
  }

  /**
   * Builds (but does not throw) a syntax exception for the given message key
   * and a single message argument.
   *
   * @param key the key passed to <code>localizer.message</code>
   * @param arg the argument passed along with the key
   * @return an exception carrying the localized message and the position
   *         <code>pos - 1</code>
   */
  private RegexSyntaxException makeException(String key, String arg) {
    String message = localizer.message(key, arg);
    int errorPos = pos - 1;
    return new RegexSyntaxException(message, errorPos);
  }

  /**
   * Tests whether a character is in the fixed set of sixteen characters this
   * class treats as <code>java.util.regex</code> metacharacters:
   * backslash, <code>^ ? * + ( ) { } | [ ] - &amp; $ .</code>
   *
   * @param c the character to test
   * @return <code>true</code> if <code>c</code> is one of those characters
   */
  static private boolean isJavaMetaChar(char c) {
    final String metaChars = "\\^?*+(){}|[]-&$.";
    return metaChars.indexOf(c) >= 0;
  }

  /**
   * Returns the character class for a one-letter category, computing it on
   * first use and keeping it in <code>categoryCharClasses</code>.
   *
   * @param ci index of the category letter within <code>categories</code>
   * @return the cached or newly computed class for that category
   */
  static private synchronized CharClass getCategoryCharClass(int ci) {
    CharClass cached = categoryCharClasses[ci];
    if (cached == null) {
      cached = computeCategoryCharClass(categories.charAt(ci));
      categoryCharClasses[ci] = cached;
    }
    return cached;
  }

  /**
   * Returns the character class for a two-letter subcategory, computing it on
   * first use and keeping it in <code>subCategoryCharClasses</code>.
   *
   * @param sci index of the subcategory; its name is the two characters of
   *            <code>subCategories</code> starting at <code>sci * 2</code>
   * @return the cached or newly computed class for that subcategory
   */
  static private synchronized CharClass getSubCategoryCharClass(int sci) {
    CharClass cached = subCategoryCharClasses[sci];
    if (cached == null) {
      int start = sci * 2;
      cached = computeSubCategoryCharClass(subCategories.substring(start, start + 2));
      subCategoryCharClasses[sci] = cached;
    }
    return cached;
  }

  // Hand-maintained corrections applied on top of the Property classes by
  // computeCategoryCharClass and computeSubCategoryCharClass.

  // Single characters added explicitly to Lu / Ll (and to L), and listed as
  // assigned when the Cn (unassigned) class is built.
  static private final char UNICODE_3_1_ADD_Lu = '\u03F4';   // added in 3.1
  static private final char UNICODE_3_1_ADD_Ll = '\u03F5';   // added in 3.1
  // 3 characters changed from No to Nl between 3.0 and 3.1
  // (this inclusive range is added to Nl and subtracted from No)
  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
  // Explicit member lists for Pi and Pf; each character of the string is one
  // member.  Both lists are also added to the class built for category P.
  static private final String CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; // Java doesn't know about category Pi
  static private final String CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; // Java doesn't know about category Pf

  /**
   * Builds the character class for a one-letter general category.
   *
   * <p>The class starts from the <code>Property</code> named by the letter.
   * When <code>surrogatesDirect</code> is false, the ranges listed in
   * <code>Categories.CATEGORY_RANGES</code> for every entry of
   * <code>Categories.CATEGORY_NAMES</code> containing that letter are added.
   * Category-specific corrections follow: Pi and Pf members for
   * <code>P</code>, the two Unicode 3.1 letters for <code>L</code>, and the
   * Cn class for <code>C</code>.
   *
   * @param code the category letter
   * @return the single <code>Property</code> when nothing was added,
   *         otherwise a <code>Union</code> of all collected classes
   */
  static private CharClass computeCategoryCharClass(char code) {
    List parts = new Vector();
    parts.add(new Property(String.valueOf(code)));
    if (!surrogatesDirect) {
      int nameIndex = Categories.CATEGORY_NAMES.indexOf(code);
      while (nameIndex >= 0) {
        // Names are two characters wide, so halve the string offset.
        int[] ranges = Categories.CATEGORY_RANGES[nameIndex / 2];
        for (int k = 0; k < ranges.length; k += 2)
          parts.add(new CharRange(ranges[k], ranges[k + 1]));
        nameIndex = Categories.CATEGORY_NAMES.indexOf(code, nameIndex + 1);
      }
    }
    switch (code) {
    case 'P':
      parts.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
      break;
    case 'L':
      parts.add(new SingleChar(UNICODE_3_1_ADD_Ll));
      parts.add(new SingleChar(UNICODE_3_1_ADD_Lu));
      break;
    case 'C':
      parts.add(computeSubCategoryCharClass("Cn")); // JDK 1.4 leaves Cn out of C?
      break;
    }
    return parts.size() == 1 ? (CharClass)parts.get(0) : new Union(parts);
  }

  /**
   * Builds the character class for a two-letter general subcategory.
   *
   * <ul>
   * <li><code>Pi</code> and <code>Pf</code> come straight from the explicit
   *     member lists.</li>
   * <li><code>Cn</code> is the <code>Property</code> (plus the whole non-BMP
   *     range when <code>surrogatesDirect</code> is false) minus everything
   *     recorded as assigned.</li>
   * <li>Every other name is its <code>Property</code>, plus the ranges from
   *     <code>Categories</code> when <code>surrogatesDirect</code> is false,
   *     with the Unicode 3.1 corrections for <code>Lu</code>,
   *     <code>Ll</code>, <code>Nl</code> and <code>No</code>.</li>
   * </ul>
   *
   * @param name the two-letter subcategory name
   * @return the character class for that subcategory
   */
  static private CharClass computeSubCategoryCharClass(String name) {
    if (name.equals("Pi"))
      return makeCharClass(CATEGORY_Pi);
    if (name.equals("Pf"))
      return makeCharClass(CATEGORY_Pf);
    CharClass property = new Property(name);
    if (name.equals("Cn")) {
      // Unassigned: start from the property and remove what is known to be assigned.
      List assigned = new Vector();
      assigned.add(new SingleChar(UNICODE_3_1_ADD_Lu));
      assigned.add(new SingleChar(UNICODE_3_1_ADD_Ll));
      CharClass minuend = property;
      if (!surrogatesDirect) {
        for (int row = 0; row < Categories.CATEGORY_RANGES.length; row++) {
          int[] ranges = Categories.CATEGORY_RANGES[row];
          for (int k = 0; k < ranges.length; k += 2)
            assigned.add(new CharRange(ranges[k], ranges[k + 1]));
        }
        minuend = new Union(new CharClass[] { property, new CharRange(NONBMP_MIN, NONBMP_MAX) });
      }
      return new Subtraction(minuend, new Union(assigned));
    }
    List parts = new Vector();
    parts.add(property);
    if (!surrogatesDirect) {
      int nameIndex = Categories.CATEGORY_NAMES.indexOf(name);
      if (nameIndex >= 0) {
        // Names are two characters wide, so halve the string offset.
        int[] ranges = Categories.CATEGORY_RANGES[nameIndex / 2];
        for (int k = 0; k < ranges.length; k += 2)
          parts.add(new CharRange(ranges[k], ranges[k + 1]));
      }
    }
    if (name.equals("No")) {
      // The characters that moved to Nl must no longer count as No.
      return new Subtraction(new Union(parts),
                             new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
                                           UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    }
    if (name.equals("Lu"))
      parts.add(new SingleChar(UNICODE_3_1_ADD_Lu));
    else if (name.equals("Ll"))
      parts.add(new SingleChar(UNICODE_3_1_ADD_Ll));
    else if (name.equals("Nl"))
      parts.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX));
    return parts.size() == 1 ? property : new Union(parts);
  }

  /**
   * Builds a class that matches any one of the characters of a string.
   *
   * @param members the characters to include; each <code>char</code> becomes
   *                one <code>SingleChar</code>
   * @return a <code>Union</code> of those single-character classes, in
   *         string order
   */
  private static CharClass makeCharClass(String members) {
    List singles = new Vector();
    char[] chars = members.toCharArray();
    for (int k = 0; k < chars.length; k++)
      singles.add(new SingleChar(chars[k]));
    return new Union(singles);
  }

  /**
   * Command-line helper: translates the regex given as the first argument and
   * writes the result to standard error on one line.  Characters in the range
   * 0x20 to 0x7E are written as they are; every other character is written
   * as a backslash, the letter u and four uppercase hex digits.
   *
   * @param args command-line arguments; <code>args[0]</code> is the regex to
   *             translate
   * @throws RegexSyntaxException if <code>translate</code> rejects the regex
   */
  public static void main(String[] args) throws RegexSyntaxException {
    final String hexDigits = "0123456789ABCDEF";
    String translated = translate(args[0]);
    StringBuffer line = new StringBuffer();
    for (int i = 0; i < translated.length(); i++) {
      char ch = translated.charAt(i);
      boolean printable = ch >= 0x20 && ch <= 0x7e;
      if (printable) {
        line.append(ch);
        continue;
      }
      line.append("\\u");
      // Emit the four nibbles from most to least significant.
      for (int nibble = 3; nibble >= 0; nibble--)
        line.append(hexDigits.charAt((ch >> (nibble * 4)) & 0xF));
    }
    System.err.println(line.toString());
  }
}

Generated by  Doxygen 1.6.0   Back to index