blob: ddde0fa3f4d4a18aaa4c3cbefab7eda1f55d946c [file] [log] [blame]
/*
* Copyright 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package javaemul.internal;
import java.nio.charset.Charset;
/**
* Provides Charset implementations.
*/
public abstract class EmulatedCharset extends Charset {
public static final EmulatedCharset UTF_8 = new UtfCharset("UTF-8");
public static final EmulatedCharset ISO_LATIN_1 = new LatinCharset("ISO-LATIN-1");
public static final EmulatedCharset ISO_8859_1 = new LatinCharset("ISO-8859-1");
private static class LatinCharset extends EmulatedCharset {
public LatinCharset(String name) {
super(name);
}
@Override
public byte[] getBytes(String str) {
int n = str.length();
byte[] bytes = new byte[n];
for (int i = 0; i < n; ++i) {
bytes[i] = (byte) (str.charAt(i) & 255);
}
return bytes;
}
@Override
public char[] decodeString(byte[] bytes, int ofs, int len) {
char[] chars = new char[len];
for (int i = 0; i < len; ++i) {
chars[i] = (char) (bytes[ofs + i] & 255);
}
return chars;
}
}
private static class UtfCharset extends EmulatedCharset {
public UtfCharset(String name) {
super(name);
}
@Override
public char[] decodeString(byte[] bytes, int ofs, int len) {
// TODO(jat): consider using decodeURIComponent(escape(bytes)) instead
int charCount = 0;
for (int i = 0; i < len; ) {
++charCount;
byte ch = bytes[ofs + i];
if ((ch & 0xC0) == 0x80) {
throw new IllegalArgumentException("Invalid UTF8 sequence");
} else if ((ch & 0x80) == 0) {
++i;
} else if ((ch & 0xE0) == 0xC0) {
i += 2;
} else if ((ch & 0xF0) == 0xE0) {
i += 3;
} else if ((ch & 0xF8) == 0xF0) {
i += 4;
} else {
// no 5+ byte sequences since max codepoint is less than 2^21
throw new IllegalArgumentException("Invalid UTF8 sequence");
}
if (i > len) {
throw new IndexOutOfBoundsException("Invalid UTF8 sequence");
}
}
char[] chars = new char[charCount];
int outIdx = 0;
int count = 0;
for (int i = 0; i < len; ) {
int ch = bytes[ofs + i++];
if ((ch & 0x80) == 0) {
count = 1;
ch &= 127;
} else if ((ch & 0xE0) == 0xC0) {
count = 2;
ch &= 31;
} else if ((ch & 0xF0) == 0xE0) {
count = 3;
ch &= 15;
} else if ((ch & 0xF8) == 0xF0) {
count = 4;
ch &= 7;
} else if ((ch & 0xFC) == 0xF8) {
count = 5;
ch &= 3;
}
while (--count > 0) {
byte b = bytes[ofs + i++];
if ((b & 0xC0) != 0x80) {
throw new IllegalArgumentException("Invalid UTF8 sequence at "
+ (ofs + i - 1) + ", byte=" + Integer.toHexString(b));
}
ch = (ch << 6) | (b & 63);
}
outIdx += Character.toChars(ch, chars, outIdx);
}
return chars;
}
@Override
public byte[] getBytes(String str) {
// TODO(jat): consider using unescape(encodeURIComponent(bytes)) instead
int n = str.length();
int byteCount = 0;
for (int i = 0; i < n;) {
int ch = str.codePointAt(i);
i += Character.charCount(ch);
if (ch < (1 << 7)) {
byteCount++;
} else if (ch < (1 << 11)) {
byteCount += 2;
} else if (ch < (1 << 16)) {
byteCount += 3;
} else if (ch < (1 << 21)) {
byteCount += 4;
} else if (ch < (1 << 26)) {
byteCount += 5;
}
}
byte[] bytes = new byte[byteCount];
int out = 0;
for (int i = 0; i < n;) {
int ch = str.codePointAt(i);
i += Character.charCount(ch);
out += encodeUtf8(bytes, out, ch);
}
return bytes;
}
/**
* Encode a single character in UTF8.
*
* @param bytes byte array to store character in
* @param ofs offset into byte array to store first byte
* @param codePoint character to encode
* @return number of bytes consumed by encoding the character
* @throws IllegalArgumentException if codepoint >= 2^26
*/
private int encodeUtf8(byte[] bytes, int ofs, int codePoint) {
if (codePoint < (1 << 7)) {
bytes[ofs] = (byte) (codePoint & 127);
return 1;
} else if (codePoint < (1 << 11)) {
// 110xxxxx 10xxxxxx
bytes[ofs++] = (byte) (((codePoint >> 6) & 31) | 0xC0);
bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
return 2;
} else if (codePoint < (1 << 16)) {
// 1110xxxx 10xxxxxx 10xxxxxx
bytes[ofs++] = (byte) (((codePoint >> 12) & 15) | 0xE0);
bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
return 3;
} else if (codePoint < (1 << 21)) {
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
bytes[ofs++] = (byte) (((codePoint >> 18) & 7) | 0xF0);
bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80);
bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
return 4;
} else if (codePoint < (1 << 26)) {
// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
bytes[ofs++] = (byte) (((codePoint >> 24) & 3) | 0xF8);
bytes[ofs++] = (byte) (((codePoint >> 18) & 63) | 0x80);
bytes[ofs++] = (byte) (((codePoint >> 12) & 63) | 0x80);
bytes[ofs++] = (byte) (((codePoint >> 6) & 63) | 0x80);
bytes[ofs] = (byte) ((codePoint & 63) | 0x80);
return 5;
}
throw new IllegalArgumentException("Character out of range: " + codePoint);
}
}
public EmulatedCharset(String name) {
super(name, null);
}
public abstract byte[] getBytes(String string);
public abstract char[] decodeString(byte[] bytes, int ofs, int len);
}