// blob: 8a124efa1bc204455b49584c677b9b2bf58939b7 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.gwt.i18n.shared;
import com.google.gwt.i18n.client.HasDirection.Direction;
import com.google.gwt.regexp.shared.RegExp;
import com.google.gwt.regexp.shared.SplitResult;
/**
 * Utility functions for performing common Bidi tests on strings.
 * <p>
 * Instances are obtained through {@link #get()}; the constructor is private.
 * The character classes used by the tests are deliberate simplifications of
 * the Unicode bidi categories (see the comments on {@code LTR_CHARS} and
 * {@code RTL_CHARS}), so results are heuristics rather than exact answers.
 */
public class BidiUtils {

  /**
   * A practical pattern to identify strong LTR characters. This pattern is not
   * completely correct according to the Unicode standard. It is simplified
   * for performance and small code size.
   * <p>
   * This is volatile to prevent the compiler from inlining this constant in
   * various references below.
   */
  private static volatile String LTR_CHARS =
      "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF"
      + "\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF";

  /**
   * A practical pattern to identify strong RTL characters. This pattern is not
   * completely correct according to the Unicode standard. It is simplified for
   * performance and small code size.
   * <p>
   * This is volatile to prevent the compiler from inlining this constant in
   * various references below.
   */
  private static volatile String RTL_CHARS =
      "\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC";

  /**
   * Regular expression to check if the first strongly directional character in
   * a string is LTR: any run of non-RTL characters from the start of the
   * string, followed by an LTR character.
   */
  private static final RegExp FIRST_STRONG_IS_LTR_RE =
      RegExp.compile("^[^" + RTL_CHARS + "]*[" + LTR_CHARS + "]");

  /**
   * Regular expression to check if the first strongly directional character in
   * a string is RTL: any run of non-LTR characters from the start of the
   * string, followed by an RTL character.
   */
  private static final RegExp FIRST_STRONG_IS_RTL_RE =
      RegExp.compile("^[^" + LTR_CHARS + "]*[" + RTL_CHARS + "]");

  /**
   * Regular expression to check if a string contains any LTR characters.
   */
  private static final RegExp HAS_ANY_LTR_RE =
      RegExp.compile("[" + LTR_CHARS + "]");

  /**
   * Regular expression to check if a string contains any RTL characters.
   */
  private static final RegExp HAS_ANY_RTL_RE =
      RegExp.compile("[" + RTL_CHARS + "]");

  /**
   * Regular expression to check if a string contains any numerals. Used to
   * differentiate between completely neutral strings and those containing
   * numbers, which are weakly LTR.
   */
  private static final RegExp HAS_NUMERALS_RE = RegExp.compile("\\d");

  /**
   * Simplified regular expression for an HTML tag (opening or closing) or an
   * HTML escape. We might want to skip over such expressions when estimating
   * the text directionality. Compiled with the global flag because it is used
   * with {@code replace} to substitute every occurrence, not just the first.
   */
  private static final RegExp SKIP_HTML_RE =
      RegExp.compile("<[^>]*>|&[^;]+;", "g");

  /**
   * Regular expression to check if a string looks like something that must
   * always be LTR even in RTL text, e.g. a URL. When estimating the
   * directionality of text containing these, we treat these as weakly LTR, like
   * numbers.
   * <p>
   * Both {@code http://} and {@code https://} prefixes are recognized, so that
   * secure URLs are not counted as ordinary strongly-LTR words.
   */
  private static final RegExp IS_REQUIRED_LTR_RE =
      RegExp.compile("^https?://.*");

  /**
   * Regular expression to check if the last strongly-directional character in
   * a piece of text is LTR: an LTR character followed only by non-RTL
   * characters up to the end of the text.
   */
  private static final RegExp LAST_STRONG_IS_LTR_RE =
      RegExp.compile("[" + LTR_CHARS + "][^" + RTL_CHARS + "]*$");

  /**
   * Regular expression to check if the last strongly-directional character in
   * a piece of text is RTL: an RTL character followed only by non-LTR
   * characters up to the end of the text.
   */
  private static final RegExp LAST_STRONG_IS_RTL_RE =
      RegExp.compile("[" + RTL_CHARS + "][^" + LTR_CHARS + "]*$");

  /**
   * This constant defines the threshold of RTL directionality. Text is
   * estimated as RTL when the fraction of RTL words among its strongly
   * directional words is strictly greater than this value.
   */
  private static final float RTL_DETECTION_THRESHOLD = 0.40f;

  /**
   * Regular expression to split a string into "words" for directionality
   * estimation based on relative word counts.
   */
  private static final RegExp WORD_SEPARATOR_RE = RegExp.compile("\\s+");

  /**
   * An instance of BidiUtils, to be returned by {@link #get()}.
   * <p>
   * Declared after all static patterns so that they are initialized before the
   * instance is constructed.
   */
  private static final BidiUtils INSTANCE = new BidiUtils();

  /**
   * Get an instance of BidiUtils.
   *
   * @return An instance of BidiUtils
   */
  public static BidiUtils get() {
    return INSTANCE;
  }

  /**
   * Not instantiable from outside; use {@link #get()}.
   */
  private BidiUtils() {
  }

  /**
   * Like {@link #endsWithLtr(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   *
   * @param str the string to check
   * @return whether LTR exit directionality was detected
   */
  public boolean endsWithLtr(String str) {
    return LAST_STRONG_IS_LTR_RE.test(str);
  }

  /**
   * Check whether the last strongly-directional character in the string is LTR.
   *
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether LTR exit directionality was detected
   */
  public boolean endsWithLtr(String str, boolean isHtml) {
    return endsWithLtr(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #endsWithRtl(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   *
   * @param str the string to check
   * @return whether RTL exit directionality was detected
   */
  public boolean endsWithRtl(String str) {
    return LAST_STRONG_IS_RTL_RE.test(str);
  }

  /**
   * Check whether the last strongly-directional character in the string is RTL.
   *
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether RTL exit directionality was detected
   */
  public boolean endsWithRtl(String str, boolean isHtml) {
    return endsWithRtl(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #estimateDirection(String, boolean)}, but assumes {@code str}
   * is not HTML / HTML-escaped.
   *
   * @param str the string to check
   * @return the string's estimated directionality
   */
  public Direction estimateDirection(String str) {
    int rtlCount = 0;
    int total = 0;
    boolean hasWeaklyLtr = false;
    SplitResult tokens = WORD_SEPARATOR_RE.split(str);
    for (int i = 0; i < tokens.length(); i++) {
      String token = tokens.get(i);
      // Order matters: each word is classified by the first test it passes.
      if (startsWithRtl(token)) {
        rtlCount++;
        total++;
      } else if (IS_REQUIRED_LTR_RE.test(token)) {
        // URL-like words are only weakly LTR; they do not count as words.
        hasWeaklyLtr = true;
      } else if (hasAnyLtr(token)) {
        total++;
      } else if (HAS_NUMERALS_RE.test(token)) {
        hasWeaklyLtr = true;
      }
    }
    if (total == 0) {
      // No strongly directional words at all: weak LTR evidence decides.
      return hasWeaklyLtr ? Direction.LTR : Direction.DEFAULT;
    }
    return (float) rtlCount / total > RTL_DETECTION_THRESHOLD
        ? Direction.RTL
        : Direction.LTR;
  }

  /**
   * Estimates the directionality of a string based on relative word counts.
   * If the number of RTL words is above a certain percentage of the total
   * number of strongly directional words, returns RTL.
   * Otherwise, if any words are strongly or weakly LTR, returns LTR.
   * Otherwise, returns DEFAULT, which is used to mean "neutral".
   * Numbers are counted as weakly LTR.
   *
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped. Use this to
   *        ignore HTML tags and escapes that would otherwise be mistaken for
   *        LTR text.
   * @return the string's directionality
   */
  public Direction estimateDirection(String str, boolean isHtml) {
    return estimateDirection(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #hasAnyLtr(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   *
   * @param str the string to be tested
   * @return whether the string contains any LTR characters
   */
  public boolean hasAnyLtr(String str) {
    return HAS_ANY_LTR_RE.test(str);
  }

  /**
   * Checks if the given string has any LTR characters in it.
   *
   * @param str the string to be tested
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether the string contains any LTR characters
   */
  public boolean hasAnyLtr(String str, boolean isHtml) {
    return hasAnyLtr(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #hasAnyRtl(String, boolean)}, but assumes {@code str} is not
   * HTML / HTML-escaped.
   *
   * @param str the string to be tested
   * @return whether the string contains any RTL characters
   */
  public boolean hasAnyRtl(String str) {
    return HAS_ANY_RTL_RE.test(str);
  }

  /**
   * Checks if the given string has any RTL characters in it.
   *
   * @param str the string to be tested
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether the string contains any RTL characters
   */
  public boolean hasAnyRtl(String str, boolean isHtml) {
    return hasAnyRtl(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #startsWithLtr(String, boolean)}, but assumes {@code str} is
   * not HTML / HTML-escaped.
   *
   * @param str the string to check
   * @return whether LTR entry directionality was detected
   */
  public boolean startsWithLtr(String str) {
    return FIRST_STRONG_IS_LTR_RE.test(str);
  }

  /**
   * Check whether the first strongly-directional character in the string is
   * LTR.
   *
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether LTR entry directionality was detected
   */
  public boolean startsWithLtr(String str, boolean isHtml) {
    return startsWithLtr(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Like {@link #startsWithRtl(String, boolean)}, but assumes {@code str} is
   * not HTML / HTML-escaped.
   *
   * @param str the string to check
   * @return whether RTL entry directionality was detected
   */
  public boolean startsWithRtl(String str) {
    return FIRST_STRONG_IS_RTL_RE.test(str);
  }

  /**
   * Check whether the first strongly-directional character in the string is
   * RTL.
   *
   * @param str the string to check
   * @param isHtml whether {@code str} is HTML / HTML-escaped
   * @return whether RTL entry directionality was detected
   */
  public boolean startsWithRtl(String str, boolean isHtml) {
    return startsWithRtl(stripHtmlIfNeeded(str, isHtml));
  }

  /**
   * Returns the input text with spaces instead of HTML tags or HTML escapes, if
   * isStripNeeded is true. Else returns the input as is.
   * Useful for text directionality estimation.
   * Note: the function should not be used in other contexts; it is not 100%
   * correct, but rather a good-enough implementation for directionality
   * estimation purposes.
   *
   * @param str the text to process
   * @param isStripNeeded whether HTML tags and escapes should be replaced
   * @return {@code str} with each tag / escape replaced by a single space when
   *         {@code isStripNeeded} is true, otherwise {@code str} itself
   */
  String stripHtmlIfNeeded(String str, boolean isStripNeeded) {
    return isStripNeeded ? SKIP_HTML_RE.replace(str, " ") : str;
  }
}