blob: 3733f734ef5859245de98e531506c6acb513a25b [file] [log] [blame]
/*
* Copyright 2008 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package java.lang;
import static javaemul.internal.InternalPreconditions.checkCriticalArgument;
import java.io.Serializable;
import javaemul.internal.NativeRegExp;
/**
* Wraps a native <code>char</code> as an object.
*
* TODO(jat): many of the classification methods implemented here are not
* correct in that they only handle ASCII characters, and many other methods
* are not currently implemented. I think the proper approach is to introduce
* a deferred binding parameter which substitutes an implementation using
* a fully-correct Unicode character database, at the expense of additional
* data being downloaded. That way developers that need the functionality
* can get it without those who don't need it paying for it.
*
* <pre>
* The following methods are still not implemented -- most would require Unicode
* character db to be useful:
* - digit / is* / to*(int codePoint)
* - isDefined(char)
* - isIdentifierIgnorable(char)
* - isJavaIdentifierPart(char)
* - isJavaIdentifierStart(char)
* - isJavaLetter(char) -- deprecated, so probably not
* - isJavaLetterOrDigit(char) -- deprecated, so probably not
* - isISOControl(char)
* - isMirrored(char)
* - isSpaceChar(char)
* - isUnicodeIdentifierPart(char)
* - isUnicodeIdentifierStart(char)
* - getDirectionality(*)
* - getNumericValue(*)
* - getType(*)
* - reverseBytes(char) -- any use for this at all in the browser?
* - toTitleCase(*)
* - all the category constants for classification
*
* The following do not properly handle characters outside of ASCII:
* - digit(char c, int radix)
* - isDigit(char c)
* - isLetter(char c)
* - isLetterOrDigit(char c)
* - isLowerCase(char c)
* - isUpperCase(char c)
* </pre>
*/
public final class Character implements Comparable<Character>, Serializable {
/**
 * Presents a region of a {@code char[]} as a {@link CharSequence} so that the
 * array-based and sequence-based entry points can share one implementation.
 * The array is referenced, not copied, and no bounds are checked here.
 */
static class CharSequenceAdapter implements CharSequence {
  /** Backing array; never copied. */
  private final char[] charArray;
  /** Array index that corresponds to position 0 of this sequence. */
  private final int start;
  /** Array index one past the last char of this sequence. */
  private final int end;

  /** Adapts the whole array. */
  public CharSequenceAdapter(char[] charArray) {
    this(charArray, 0, charArray.length);
  }

  /**
   * Adapts the array region {@code [start, end)}.
   *
   * @param charArray backing array
   * @param start array index of the first char (inclusive)
   * @param end array index past the last char (exclusive)
   */
  public CharSequenceAdapter(char[] charArray, int start, int end) {
    this.charArray = charArray;
    this.start = start;
    this.end = end;
  }

  /** Returns the char at {@code index}, counted from the region's start. */
  @Override
  public char charAt(int index) {
    return charArray[start + index];
  }

  /** Returns the number of chars in the adapted region. */
  @Override
  public int length() {
    return end - start;
  }

  /** Returns a new adapter over the same array, narrowed to the given range. */
  @Override
  public CharSequence subSequence(int start, int end) {
    // The arguments are relative to this view, so shift both by our own offset.
    int from = this.start + start;
    int to = this.start + end;
    return new CharSequenceAdapter(charArray, from, to);
  }
}
/**
* Holder for the cache of boxed characters.
*
* Use nested class to avoid clinit on outer.
*/
private static class BoxedValues {
// Box values according to JLS - from U+0000 to U+007F. The 128 slots start
// out null; valueOf(char) fills a slot the first time that char is boxed.
private static Character[] boxedValues = new Character[128];
}
/** Class object exposed as the type token; here it is {@code Character.class}. */
public static final Class<Character> TYPE = Character.class;
/** Smallest radix accepted by {@link #digit(char, int)} and {@link #forDigit(int, int)}. */
public static final int MIN_RADIX = 2;
/** Largest radix accepted by {@link #digit(char, int)} and {@link #forDigit(int, int)}. */
public static final int MAX_RADIX = 36;
/** Smallest {@code char} value. */
public static final char MIN_VALUE = '\u0000';
/** Largest {@code char} value; also the upper bound used by {@link #isBmpCodePoint(int)}. */
public static final char MAX_VALUE = '\uFFFF';
/** Lowest surrogate code unit (same value as {@link #MIN_HIGH_SURROGATE}). */
public static final char MIN_SURROGATE = '\uD800';
/** Highest surrogate code unit (same value as {@link #MAX_LOW_SURROGATE}). */
public static final char MAX_SURROGATE = '\uDFFF';
/** Lower bound tested by {@link #isLowSurrogate(char)}. */
public static final char MIN_LOW_SURROGATE = '\uDC00';
/** Upper bound tested by {@link #isLowSurrogate(char)}. */
public static final char MAX_LOW_SURROGATE = '\uDFFF';
/** Lower bound tested by {@link #isHighSurrogate(char)}. */
public static final char MIN_HIGH_SURROGATE = '\uD800';
/** Upper bound tested by {@link #isHighSurrogate(char)}. */
public static final char MAX_HIGH_SURROGATE = '\uDBFF';
/** First code point that needs two chars; see {@link #charCount(int)}. */
public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
/** Smallest code point accepted by {@link #isValidCodePoint(int)}. */
public static final int MIN_CODE_POINT = 0x0000;
/** Largest code point accepted by {@link #isValidCodePoint(int)}. */
public static final int MAX_CODE_POINT = 0x10FFFF;
/** Width of a {@code char} in bits. */
public static final int SIZE = 16;
/** Width of a {@code char} in bytes, derived from {@link #SIZE}. */
public static final int BYTES = SIZE / Byte.SIZE;
/**
 * Returns how many {@code char} values are needed to hold a code point.
 *
 * @param codePoint the code point to measure; it is not validated
 * @return 2 when {@code codePoint} is at or above
 *     {@link #MIN_SUPPLEMENTARY_CODE_POINT}, otherwise 1
 */
public static int charCount(int codePoint) {
  if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
    return 1;
  }
  return 2;
}
/**
 * Returns the code point that starts at {@code index} of the array, pairing
 * surrogates when the following char is still inside the array.
 *
 * @param a source chars
 * @param index position of the first char of the code point
 * @return the code point, or the single char value when no pair is formed
 */
public static int codePointAt(char[] a, int index) {
  CharSequence chars = new CharSequenceAdapter(a);
  return codePointAt(chars, index, a.length);
}
/**
 * Returns the code point that starts at {@code index} of the array, looking
 * no further than {@code limit} for the second half of a surrogate pair.
 *
 * @param a source chars
 * @param index position of the first char of the code point
 * @param limit exclusive upper bound for the char that may complete a pair
 * @return the code point, or the single char value when no pair is formed
 */
public static int codePointAt(char[] a, int index, int limit) {
  CharSequence chars = new CharSequenceAdapter(a);
  return codePointAt(chars, index, limit);
}
/**
 * Returns the code point that starts at {@code index} of the sequence, using
 * the sequence length as the limit for completing a surrogate pair.
 *
 * @param seq source chars
 * @param index position of the first char of the code point
 * @return the code point, or the single char value when no pair is formed
 */
public static int codePointAt(CharSequence seq, int index) {
  int limit = seq.length();
  return codePointAt(seq, index, limit);
}
/**
 * Returns the code point that ends just before {@code index} of the array,
 * looking back as far as array position 0 for a leading surrogate.
 *
 * @param a source chars
 * @param index position immediately after the code point
 * @return the code point, or the single char value when no pair is formed
 */
public static int codePointBefore(char[] a, int index) {
  CharSequence chars = new CharSequenceAdapter(a);
  return codePointBefore(chars, index, 0);
}
/**
 * Returns the code point that ends just before {@code index} of the array,
 * looking back no further than {@code start} for a leading surrogate.
 *
 * @param a source chars
 * @param index position immediately after the code point
 * @param start lowest array position that may hold the leading surrogate
 * @return the code point, or the single char value when no pair is formed
 */
public static int codePointBefore(char[] a, int index, int start) {
  CharSequence chars = new CharSequenceAdapter(a);
  return codePointBefore(chars, index, start);
}
/**
 * Returns the code point that ends just before {@code index} of the sequence,
 * looking back as far as position 0 for a leading surrogate.
 *
 * @param cs source chars
 * @param index position immediately after the code point
 * @return the code point, or the single char value when no pair is formed
 */
public static int codePointBefore(CharSequence cs, int index) {
  final int firstPosition = 0;
  return codePointBefore(cs, index, firstPosition);
}
/**
 * Counts the code points in {@code count} chars of the array, starting at
 * {@code offset}.
 *
 * @param a source chars
 * @param offset array position of the first char to examine
 * @param count number of chars to examine
 * @return number of code points, with each surrogate pair counted once
 */
public static int codePointCount(char[] a, int offset, int count) {
  CharSequence chars = new CharSequenceAdapter(a);
  int endIndex = offset + count;
  return codePointCount(chars, offset, endIndex);
}
/**
 * Counts the code points in the range {@code [beginIndex, endIndex)} of the
 * sequence. A high surrogate immediately followed, inside the range, by a low
 * surrogate counts as one code point; every other char counts as one.
 *
 * @param seq source chars
 * @param beginIndex first position to examine (inclusive)
 * @param endIndex position at which to stop (exclusive)
 * @return number of code points in the range
 */
public static int codePointCount(CharSequence seq, int beginIndex,
    int endIndex) {
  int total = 0;
  int pos = beginIndex;
  while (pos < endIndex) {
    char current = seq.charAt(pos);
    pos++;
    boolean completesPair = isHighSurrogate(current)
        && pos < endIndex
        && isLowSurrogate(seq.charAt(pos));
    if (completesPair) {
      // Consume the trailing half so the pair is counted only once.
      pos++;
    }
    total++;
  }
  return total;
}
/**
 * Compares two chars numerically.
 *
 * @param x first char
 * @param y second char
 * @return the difference {@code x - y}: negative, zero or positive
 */
public static int compare(char x, char y) {
  // Widening both operands to int first means the subtraction cannot overflow.
  int left = x;
  int right = y;
  return left - right;
}
/**
 * Returns the numeric value of an ASCII digit or letter in the given radix.
 *
 * TODO: correct Unicode handling.
 *
 * @param c the char to convert; only '0'-'9', 'a'-'z' and 'A'-'Z' are recognised
 * @param radix the radix, expected in {@code [MIN_RADIX, MAX_RADIX]}
 * @return the digit value, or -1 when the radix is out of range or {@code c}
 *     is not a digit of that radix
 */
public static int digit(char c, int radix) {
  boolean radixSupported = radix >= MIN_RADIX && radix <= MAX_RADIX;
  if (!radixSupported) {
    return -1;
  }
  int value = -1;
  if (c >= '0' && c < '0' + Math.min(radix, 10)) {
    value = c - '0';
  } else if (c >= 'a' && c < (radix + 'a' - 10)) {
    // Letters stand for the values from 10 upwards, hence the re-basing by 10.
    value = c - 'a' + 10;
  } else if (c >= 'A' && c < (radix + 'A' - 10)) {
    value = c - 'A' + 10;
  }
  return value;
}
/**
 * Returns the char that represents {@code digit} in the given radix.
 *
 * @param digit value to convert, expected in {@code [0, radix)}
 * @param radix the radix, expected in {@code [MIN_RADIX, MAX_RADIX]}
 * @return '0'-'9' or 'a'-'z', or the char with value 0 when either argument
 *     is out of range
 */
public static char forDigit(int digit, int radix) {
  boolean radixSupported = radix >= MIN_RADIX && radix <= MAX_RADIX;
  boolean digitInRange = digit >= 0 && digit < radix;
  if (radixSupported && digitInRange) {
    return forDigit(digit);
  }
  return 0;
}
/**
 * Returns the hash code of a char, which is its numeric value.
 *
 * @param c the char to hash
 * @return {@code c} widened to {@code int}
 */
public static int hashCode(char c) {
  return (int) c;
}
/**
 * Tests whether a code point fits in a single {@code char}.
 *
 * @param codePoint the code point to test
 * @return {@code true} when it lies in {@code [MIN_VALUE, MAX_VALUE]}
 */
public static boolean isBmpCodePoint(int codePoint) {
  if (codePoint < MIN_VALUE) {
    return false;
  }
  return codePoint <= MAX_VALUE;
}
/** Lazily created matcher for the pattern {@code \d}; null until first use. */
private static NativeRegExp digitRegex;

/**
 * Tests whether a char is a digit by matching it, as a one-char string,
 * against the native regular expression {@code \d}.
 *
 * TODO: correct Unicode handling.
 *
 * @param c the char to test
 * @return whatever the native regex reports for {@code c}
 */
public static boolean isDigit(char c) {
  NativeRegExp matcher = digitRegex;
  if (matcher == null) {
    // Build the regex once and keep it for later calls.
    matcher = new NativeRegExp("\\d");
    digitRegex = matcher;
  }
  return matcher.test(String.valueOf(c));
}
/**
 * Tests whether a char is a high (leading) surrogate.
 *
 * @param ch the char to test
 * @return {@code true} when {@code ch} lies in
 *     {@code [MIN_HIGH_SURROGATE, MAX_HIGH_SURROGATE]}
 */
public static boolean isHighSurrogate(char ch) {
  if (ch < MIN_HIGH_SURROGATE) {
    return false;
  }
  return ch <= MAX_HIGH_SURROGATE;
}
/** Lazily created case-insensitive matcher for {@code [A-Z]}; null until first use. */
private static NativeRegExp letterRegex;

/**
 * Tests whether a char is a letter by matching it, as a one-char string,
 * against the case-insensitive native regular expression {@code [A-Z]}.
 *
 * TODO: correct Unicode handling.
 *
 * @param c the char to test
 * @return whatever the native regex reports for {@code c}
 */
public static boolean isLetter(char c) {
  NativeRegExp matcher = letterRegex;
  if (matcher == null) {
    // Build the regex once and keep it for later calls.
    matcher = new NativeRegExp("[A-Z]", "i");
    letterRegex = matcher;
  }
  return matcher.test(String.valueOf(c));
}
/** Lazily created case-insensitive matcher for {@code [A-Z\d]}; null until first use. */
private static NativeRegExp letterOrDigitRegex;

/**
 * Tests whether a char is a letter or a digit by matching it, as a one-char
 * string, against the case-insensitive native regular expression
 * {@code [A-Z\d]}.
 *
 * TODO: correct Unicode handling.
 *
 * @param c the char to test
 * @return whatever the native regex reports for {@code c}
 */
public static boolean isLetterOrDigit(char c) {
  NativeRegExp matcher = letterOrDigitRegex;
  if (matcher == null) {
    // Build the regex once and keep it for later calls.
    matcher = new NativeRegExp("[A-Z\\d]", "i");
    letterOrDigitRegex = matcher;
  }
  return matcher.test(String.valueOf(c));
}
/**
 * Tests whether a char is a lower-case letter: it must be unchanged by
 * {@link #toLowerCase(char)} and be accepted by {@link #isLetter(char)}.
 *
 * TODO: correct Unicode handling.
 *
 * @param c the char to test
 * @return {@code true} when both conditions hold
 */
public static boolean isLowerCase(char c) {
  if (toLowerCase(c) != c) {
    return false;
  }
  return isLetter(c);
}
/**
 * Tests whether a char is a low (trailing) surrogate.
 *
 * @param ch the char to test
 * @return {@code true} when {@code ch} lies in
 *     {@code [MIN_LOW_SURROGATE, MAX_LOW_SURROGATE]}
 */
public static boolean isLowSurrogate(char ch) {
  if (ch < MIN_LOW_SURROGATE) {
    return false;
  }
  return ch <= MAX_LOW_SURROGATE;
}
/**
 * Tests whether a char is one of space, newline, tab, form feed or carriage
 * return.
 *
 * @param c the char to test
 * @return {@code true} for exactly those five chars
 * @deprecated see {@link #isWhitespace(char)}.
 */
@Deprecated
public static boolean isSpace(char c) {
  return c == ' '
      || c == '\n'
      || c == '\t'
      || c == '\f'
      || c == '\r';
}
/**
 * Tests whether a char is whitespace by handing it, as a one-char string, to
 * the string-based whitespace check of this class.
 *
 * @param ch the char to test
 * @return {@code true} when the string-based check accepts it
 */
public static boolean isWhitespace(char ch) {
  String single = String.valueOf(ch);
  return isWhitespace(single);
}
/**
 * Tests whether a code point is whitespace by converting it to a string and
 * handing that to the string-based whitespace check of this class.
 *
 * @param codePoint the code point to test
 * @return {@code true} when the string-based check accepts it
 */
public static boolean isWhitespace(int codePoint) {
  String single = String.fromCodePoint(codePoint);
  return isWhitespace(single);
}
/** Lazily created matcher for the whitespace set below; null until first use. */
private static NativeRegExp whitespaceRegex;

/**
 * Tests a string holding one char or code point against the whitespace set.
 *
 * The set consists of the separator characters listed in the first character
 * class, the range tab through carriage return plus the space, and the
 * controls 0x1C through 0x1F. The non-breaking spaces U+00A0, U+2007 and
 * U+202F are left out on purpose. U+FEFF is also left out: it is a format
 * character (general category Cf), not a separator, and
 * java.lang.Character.isWhitespace reports false for it.
 *
 * @param ch string form of the char or code point to test
 * @return {@code true} when the native regex matches {@code ch}
 */
private static boolean isWhitespace(String ch) {
  if (whitespaceRegex == null) {
    // The regex would just be /\s/, but browsers handle non-breaking spaces inconsistently. Also,
    // the Java definition includes separators.
    whitespaceRegex =
        new NativeRegExp(
            "[\\u1680\\u180E\\u2000-\\u2006\\u2008-\\u200A\\u2028\\u2029\\u205F\\u3000]"
                + "|[\\t-\\r ]"
                + "|[\\x1C-\\x1F]");
  }
  return whitespaceRegex.test(ch);
}
/**
 * Tests whether a code point needs a surrogate pair.
 *
 * @param codePoint the code point to test
 * @return {@code true} when it lies in
 *     {@code [MIN_SUPPLEMENTARY_CODE_POINT, MAX_CODE_POINT]}
 */
public static boolean isSupplementaryCodePoint(int codePoint) {
  if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
    return false;
  }
  return codePoint <= MAX_CODE_POINT;
}
/**
 * Tests whether two chars form a surrogate pair.
 *
 * @param highSurrogate candidate leading half
 * @param lowSurrogate candidate trailing half
 * @return {@code true} when the first is a high surrogate and the second a
 *     low surrogate
 */
public static boolean isSurrogatePair(char highSurrogate, char lowSurrogate) {
  if (!isHighSurrogate(highSurrogate)) {
    return false;
  }
  return isLowSurrogate(lowSurrogate);
}
/**
 * Tests whether a char is title case, approximated here as: it differs from
 * both its upper-case and its lower-case mapping.
 *
 * @param c the char to test
 * @return {@code true} when neither case mapping leaves {@code c} unchanged
 */
public static boolean isTitleCase(char c) {
  // https://www.compart.com/en/unicode/category/Lt
  if (c == toUpperCase(c)) {
    return false;
  }
  return c != toLowerCase(c);
}
/**
 * Tests whether a char is an upper-case letter: it must be unchanged by
 * {@link #toUpperCase(char)} and be accepted by {@link #isLetter(char)}.
 *
 * TODO: correct Unicode handling.
 *
 * @param c the char to test
 * @return {@code true} when both conditions hold
 */
public static boolean isUpperCase(char c) {
  if (toUpperCase(c) != c) {
    return false;
  }
  return isLetter(c);
}
/**
 * Tests whether a value is a valid code point.
 *
 * @param codePoint the value to test
 * @return {@code true} when it lies in {@code [MIN_CODE_POINT, MAX_CODE_POINT]}
 */
public static boolean isValidCodePoint(int codePoint) {
  if (codePoint < MIN_CODE_POINT) {
    return false;
  }
  return codePoint <= MAX_CODE_POINT;
}
/**
 * Returns the array index reached by moving {@code codePointOffset} code
 * points away from {@code index}, staying within the subarray that starts at
 * {@code start} and spans {@code count} chars.
 *
 * @param a source chars
 * @param start array index of the first char of the subarray
 * @param count number of chars in the subarray
 * @param index array index to move from
 * @param codePointOffset number of code points to move; negative moves
 *     backwards
 * @return the resulting array index
 */
public static int offsetByCodePoints(char[] a, int start, int count, int index,
    int codePointOffset) {
  // The adapter takes an exclusive end index, not a length.
  CharSequence window = new CharSequenceAdapter(a, start, start + count);
  // Positions inside the adapter are relative to start, so translate the
  // array index into the window and translate the answer back again.
  int relative = offsetByCodePoints(window, index - start, codePointOffset);
  return start + relative;
}
/**
 * Returns the index reached by moving {@code codePointOffset} code points
 * away from {@code index} in the sequence. A high surrogate directly followed
 * by a low surrogate is stepped over as one code point; an unpaired surrogate,
 * including one at either end of the sequence, counts as one code point.
 *
 * @param seq source chars
 * @param index position to move from
 * @param codePointOffset number of code points to move; negative moves
 *     backwards
 * @return the resulting index
 */
public static int offsetByCodePoints(CharSequence seq, int index,
    int codePointOffset) {
  if (codePointOffset < 0) {
    // move backwards
    while (codePointOffset < 0) {
      --index;
      // Look one further back only when such a char exists; a low surrogate
      // at position 0 has nothing to pair with.
      if (Character.isLowSurrogate(seq.charAt(index))
          && index > 0
          && Character.isHighSurrogate(seq.charAt(index - 1))) {
        --index;
      }
      ++codePointOffset;
    }
  } else {
    // move forwards
    while (codePointOffset > 0) {
      // Look one ahead only when such a char exists; a high surrogate in the
      // last position has nothing to pair with.
      if (Character.isHighSurrogate(seq.charAt(index))
          && index + 1 < seq.length()
          && Character.isLowSurrogate(seq.charAt(index + 1))) {
        ++index;
      }
      ++index;
      --codePointOffset;
    }
  }
  return index;
}
/**
 * Converts a code point to its char representation.
 *
 * @param codePoint the code point; checked to lie in
 *     {@code [0, MAX_CODE_POINT]}
 * @return a new one-element array for code points below
 *     {@link #MIN_SUPPLEMENTARY_CODE_POINT}, otherwise a new two-element
 *     array holding the high and then the low surrogate
 */
public static char[] toChars(int codePoint) {
  checkCriticalArgument(codePoint >= 0 && codePoint <= MAX_CODE_POINT);
  if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
    return new char[] {(char) codePoint};
  }
  char high = getHighSurrogate(codePoint);
  char low = getLowSurrogate(codePoint);
  return new char[] {high, low};
}
/**
 * Writes the char representation of a code point into {@code dst}.
 *
 * @param codePoint the code point; checked to lie in
 *     {@code [0, MAX_CODE_POINT]}
 * @param dst destination array
 * @param dstIndex position in {@code dst} of the first char written
 * @return the number of chars written: 2 for a surrogate pair, otherwise 1
 */
public static int toChars(int codePoint, char[] dst, int dstIndex) {
  checkCriticalArgument(codePoint >= 0 && codePoint <= MAX_CODE_POINT);
  if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
    dst[dstIndex] = (char) codePoint;
    return 1;
  }
  // High surrogate first, low surrogate in the slot right after it.
  dst[dstIndex] = getHighSurrogate(codePoint);
  dst[dstIndex + 1] = getLowSurrogate(codePoint);
  return 2;
}
/**
 * Combines a surrogate pair into the code point it encodes. The arguments are
 * not checked to be surrogates.
 *
 * @param highSurrogate leading half of the pair
 * @param lowSurrogate trailing half of the pair
 * @return the combined code point
 */
public static int toCodePoint(char highSurrogate, char lowSurrogate) {
  // Each half carries ten payload bits in its low end; together they give the
  // distance of the code point above MIN_SUPPLEMENTARY_CODE_POINT.
  int upperTenBits = (highSurrogate & 1023) << 10;
  int lowerTenBits = lowSurrogate & 1023;
  return MIN_SUPPLEMENTARY_CODE_POINT + upperTenBits + lowerTenBits;
}
/**
 * Maps a char to lower case by delegating to {@code CaseMapper}.
 *
 * @param c the char to convert
 * @return the result of {@code CaseMapper.charToLowerCase(c)}
 */
public static char toLowerCase(char c) {
  char lowered = CaseMapper.charToLowerCase(c);
  return lowered;
}
/**
 * Returns a string holding just the given char.
 *
 * @param x the char to convert
 * @return the result of {@code String.valueOf(x)}
 */
public static String toString(char x) {
  String text = String.valueOf(x);
  return text;
}
/**
 * Maps a char to upper case by delegating to {@code CaseMapper}.
 *
 * @param c the char to convert
 * @return the result of {@code CaseMapper.charToUpperCase(c)}
 */
public static char toUpperCase(char c) {
  char raised = CaseMapper.charToUpperCase(c);
  return raised;
}
/**
 * Returns a {@code Character} for the given char. Values below 128 are served
 * from a cache that is filled on demand, so repeated calls return the same
 * instance; every other value gets a new instance on each call.
 *
 * @param c the char to box
 * @return a {@code Character} wrapping {@code c}
 */
public static Character valueOf(char c) {
  if (c >= 128) {
    return new Character(c);
  }
  Character cached = BoxedValues.boxedValues[c];
  if (cached == null) {
    cached = new Character(c);
    BoxedValues.boxedValues[c] = cached;
  }
  return cached;
}
/**
 * Returns the code point that starts at {@code index}. When the char there is
 * a high surrogate and the next position is below {@code limit} and holds a
 * low surrogate, the two are combined; otherwise the char's own value is
 * returned.
 *
 * @param cs source chars
 * @param index position of the first char of the code point
 * @param limit exclusive upper bound for the position of the second char
 * @return the code point, or the single char value when no pair is formed
 */
static int codePointAt(CharSequence cs, int index, int limit) {
  char first = cs.charAt(index);
  if (!Character.isHighSurrogate(first)) {
    return first;
  }
  int next = index + 1;
  if (next >= limit) {
    // Nothing within the limit that could complete the pair.
    return first;
  }
  char second = cs.charAt(next);
  if (Character.isLowSurrogate(second)) {
    return Character.toCodePoint(first, second);
  }
  return first;
}
/**
 * Returns the code point that ends just before {@code index}. When the char
 * there is a low surrogate, its position is above {@code start}, and the char
 * before it is a high surrogate, the two are combined; otherwise the char's
 * own value is returned.
 *
 * @param cs source chars
 * @param index position immediately after the code point
 * @param start lowest position that may hold the leading surrogate
 * @return the code point, or the single char value when no pair is formed
 */
static int codePointBefore(CharSequence cs, int index, int start) {
  int pos = index - 1;
  char last = cs.charAt(pos);
  if (isLowSurrogate(last) && pos > start) {
    char previous = cs.charAt(pos - 1);
    if (isHighSurrogate(previous)) {
      return toCodePoint(previous, last);
    }
  }
  return last;
}
/**
 * Converts a digit value to its char without any range checking. Shared
 * implementation with {@link Long#toString}.
 *
 * @skip
 */
static char forDigit(int digit) {
  // Compare via the difference, exactly as callers have always observed.
  final int aboveNine = digit - 10;
  if (aboveNine < 0) {
    return (char) ('0' + digit);
  }
  return (char) ('a' + aboveNine);
}
/**
 * Computes the high surrogate character of the UTF16 representation of a
 * non-BMP code point. See {@link #getLowSurrogate}.
 *
 * @param codePoint requested codePoint, required to be >=
 *     MIN_SUPPLEMENTARY_CODE_POINT
 * @return high surrogate character
 */
static char getHighSurrogate(int codePoint) {
  int aboveBmp = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
  // Bits 10-19 of the distance above the BMP go into the high surrogate.
  int upperTenBits = (aboveBmp >> 10) & 1023;
  return (char) (MIN_HIGH_SURROGATE + upperTenBits);
}
/**
 * Computes the low surrogate character of the UTF16 representation of a
 * non-BMP code point. See {@link #getHighSurrogate}.
 *
 * @param codePoint requested codePoint, required to be >=
 *     MIN_SUPPLEMENTARY_CODE_POINT
 * @return low surrogate character
 */
static char getLowSurrogate(int codePoint) {
  int aboveBmp = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
  // Bits 0-9 of the distance above the BMP go into the low surrogate.
  int lowerTenBits = aboveBmp & 1023;
  return (char) (MIN_LOW_SURROGATE + lowerTenBits);
}
/** The wrapped primitive; set once in the constructor. */
private final transient char value;
/**
* Creates a wrapper around the given char.
*
* @param value the char to wrap
*/
public Character(char value) {
this.value = value;
}
/**
 * Returns the wrapped primitive.
 *
 * @return the char this object was created with
 */
public char charValue() {
  return this.value;
}
/**
 * Compares this object's char with another's, using
 * {@link #compare(char, char)}.
 *
 * @param c the object to compare with
 * @return negative, zero or positive as this char is below, equal to or
 *     above the other
 */
@Override
public int compareTo(Character c) {
  char other = c.value;
  return compare(this.value, other);
}
/**
 * Tests for equality with another object.
 *
 * @param o the object to compare with
 * @return {@code true} when {@code o} is a {@code Character} wrapping the
 *     same char
 */
@Override
public boolean equals(Object o) {
  if (!(o instanceof Character)) {
    return false;
  }
  Character other = (Character) o;
  return other.value == value;
}
/**
 * Returns the hash code of the wrapped char, as computed by
 * {@link #hashCode(char)}.
 *
 * @return the hash code
 */
@Override
public int hashCode() {
  return hashCode(this.value);
}
/**
 * Returns a string holding just the wrapped char.
 *
 * @return the result of {@code String.valueOf} applied to the wrapped char
 */
@Override
public String toString() {
  String text = String.valueOf(this.value);
  return text;
}
}