| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| * use this file except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| * License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| package com.google.gwt.i18n.shared; |
| |
| import com.google.gwt.i18n.client.HasDirection.Direction; |
| import com.google.gwt.regexp.shared.RegExp; |
| import com.google.gwt.regexp.shared.SplitResult; |
| |
| /** |
| * Utility functions for performing common Bidi tests on strings. |
| */ |
| public class BidiUtils { |
| |
| /** |
| * A practical pattern to identify strong LTR characters. This pattern is not |
| * completely correct according to the Unicode standard. It is simplified |
| * for performance and small code size. |
| * <p> |
| * This is volatile to prevent the compiler from inlining this constant in |
| * various references below. |
| */ |
| private static volatile String LTR_CHARS = |
| "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF" + |
| "\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF"; |
| |
| /** |
| * A practical pattern to identify strong RTL characters. This pattern is not |
| * completely correct according to the Unicode standard. It is simplified for |
| * performance and small code size. |
| * <p> |
| * This is volatile to prevent the compiler from inlining this constant in |
| * various references below. |
| */ |
| private static volatile String RTL_CHARS = |
| "\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC"; |
| |
| /** |
| * Regular expression to check if the first strongly directional character in |
| * a string is LTR. |
| */ |
| private static final RegExp FIRST_STRONG_IS_LTR_RE = |
| RegExp.compile("^[^" + RTL_CHARS + "]*[" + LTR_CHARS + ']'); |
| |
| /** |
| * Regular expression to check if the first strongly directional character in |
| * a string is RTL. |
| */ |
| private static final RegExp FIRST_STRONG_IS_RTL_RE = |
| RegExp.compile("^[^" + LTR_CHARS + "]*[" + RTL_CHARS + ']'); |
| |
| /** |
| * Regular expression to check if a string contains any LTR characters. |
| */ |
| private static final RegExp HAS_ANY_LTR_RE = |
| RegExp.compile("[" + LTR_CHARS + ']'); |
| |
| /** |
| * Regular expression to check if a string contains any RTL characters. |
| */ |
| private static final RegExp HAS_ANY_RTL_RE = |
| RegExp.compile("[" + RTL_CHARS + ']'); |
| |
| /** |
| * Regular expression to check if a string contains any numerals. Used to |
| * differentiate between completely neutral strings and those containing |
| * numbers, which are weakly LTR. |
| */ |
| private static final RegExp HAS_NUMERALS_RE = RegExp.compile("\\d"); |
| |
| /** |
| * Simplified regular expression for an HTML tag (opening or closing) or an |
| * HTML escape. We might want to skip over such expressions when estimating |
| * the text directionality. |
| */ |
| private static final RegExp SKIP_HTML_RE = |
| RegExp.compile("<[^>]*>|&[^;]+;", "g"); |
| |
| /** |
| * An instance of BidiUtils, to be returned by {@link #get()}. |
| */ |
| private static final BidiUtils INSTANCE = new BidiUtils(); |
| |
| /** |
| * Regular expression to check if a string looks like something that must |
| * always be LTR even in RTL text, e.g. a URL. When estimating the |
| * directionality of text containing these, we treat these as weakly LTR, like |
| * numbers. |
| */ |
| private static final RegExp IS_REQUIRED_LTR_RE = RegExp.compile("^http://.*"); |
| |
| /** |
| * Regular expressions to check if the last strongly-directional character in |
| * a piece of text is LTR. |
| */ |
| private static final RegExp LAST_STRONG_IS_LTR_RE = |
| RegExp.compile("[" + LTR_CHARS + "][^" + RTL_CHARS + "]*$"); |
| |
| /** |
| * Regular expressions to check if the last strongly-directional character in |
| * a piece of text is RTL. |
| */ |
| private static final RegExp LAST_STRONG_IS_RTL_RE = |
| RegExp.compile("[" + RTL_CHARS + "][^" + LTR_CHARS + "]*$"); |
| |
| /** |
| * This constant defines the threshold of RTL directionality. |
| */ |
| private static final float RTL_DETECTION_THRESHOLD = 0.40f; |
| |
| /** |
| * Regular expression to split a string into "words" for directionality |
| * estimation based on relative word counts. |
| */ |
| private static final RegExp WORD_SEPARATOR_RE = RegExp.compile("\\s+"); |
| |
| /** |
| * Get an instance of BidiUtils. |
| * @return An instance of BidiUtils |
| */ |
| public static BidiUtils get() { |
| return INSTANCE; |
| } |
| |
| /** |
| * Not instantiable. |
| */ |
| private BidiUtils() { |
| } |
| |
| /** |
| * Like {@link #endsWithLtr(String, boolean)}, but assumes {@code str} is not |
| * HTML / HTML-escaped. |
| */ |
| public boolean endsWithLtr(String str) { |
| return LAST_STRONG_IS_LTR_RE.test(str); |
| } |
| |
| /** |
| * Check whether the last strongly-directional character in the string is LTR. |
| * @param str the string to check |
| * @param isHtml whether str is HTML / HTML-escaped |
| * @return whether LTR exit directionality was detected |
| */ |
| public boolean endsWithLtr(String str, boolean isHtml) { |
| return endsWithLtr(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Like {@link #endsWithRtl(String, boolean)}, but assumes {@code str} is not |
| * HTML / HTML-escaped. |
| */ |
| public boolean endsWithRtl(String str) { |
| return LAST_STRONG_IS_RTL_RE.test(str); |
| } |
| |
| /** |
| * Check whether the last strongly-directional character in the string is RTL. |
| * @param str the string to check |
| * @param isHtml whether str is HTML / HTML-escaped |
| * @return whether RTL exit directionality was detected |
| */ |
| public boolean endsWithRtl(String str, boolean isHtml) { |
| return endsWithRtl(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Like {@link #estimateDirection(String, boolean)}, but assumes {@code str} |
| * is not HTML / HTML-escaped. |
| */ |
| public Direction estimateDirection(String str) { |
| int rtlCount = 0; |
| int total = 0; |
| boolean hasWeaklyLtr = false; |
| SplitResult tokens = WORD_SEPARATOR_RE.split(str); |
| for (int i = 0; i < tokens.length(); i++) { |
| String token = tokens.get(i); |
| if (startsWithRtl(token)) { |
| rtlCount++; |
| total++; |
| } else if (IS_REQUIRED_LTR_RE.test(token)) { |
| hasWeaklyLtr = true; |
| } else if (hasAnyLtr(token)) { |
| total++; |
| } else if (HAS_NUMERALS_RE.test(token)) { |
| hasWeaklyLtr = true; |
| } |
| } |
| |
| return total == 0 ? (hasWeaklyLtr ? Direction.LTR : Direction.DEFAULT) |
| : ((float) rtlCount / total > RTL_DETECTION_THRESHOLD ? Direction.RTL : |
| Direction.LTR); |
| } |
| |
| /** |
| * Estimates the directionality of a string based on relative word counts. |
| * If the number of RTL words is above a certain percentage of the total |
| * number of strongly directional words, returns RTL. |
| * Otherwise, if any words are strongly or weakly LTR, returns LTR. |
| * Otherwise, returns DEFAULT, which is used to mean "neutral". |
| * Numbers are counted as weakly LTR. |
| * @param str the string to check |
| * @param isHtml whether {@code str} is HTML / HTML-escaped. Use this to |
| * ignore HTML tags and escapes that would otherwise be mistaken for |
| * LTR text. |
| * @return the string's directionality |
| */ |
| public Direction estimateDirection(String str, boolean isHtml) { |
| return estimateDirection(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Like {@link #hasAnyLtr(String, boolean)}, but assumes {@code str} is not |
| * HTML / HTML-escaped. |
| * @param str the string to be tested |
| * @return whether the string contains any LTR characters |
| */ |
| public boolean hasAnyLtr(String str) { |
| return HAS_ANY_LTR_RE.test(str); |
| } |
| |
| /** |
| * Checks if the given string has any LTR characters in it. |
| * @param str the string to be tested |
| * @param isHtml whether str is HTML / HTML-escaped |
| * @return whether the string contains any LTR characters |
| */ |
| public boolean hasAnyLtr(String str, boolean isHtml) { |
| return hasAnyLtr(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Like {@link #hasAnyRtl(String, boolean)}, but assumes {@code str} is not |
| * HTML / HTML-escaped. |
| * @param str the string to be tested |
| * @return whether the string contains any RTL characters |
| */ |
| public boolean hasAnyRtl(String str) { |
| return HAS_ANY_RTL_RE.test(str); |
| } |
| |
| /** |
| * Checks if the given string has any RTL characters in it. |
| * @param isHtml whether str is HTML / HTML-escaped |
| * @param str the string to be tested |
| * @return whether the string contains any RTL characters |
| */ |
| public boolean hasAnyRtl(String str, boolean isHtml) { |
| return hasAnyRtl(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Like {@link #startsWithLtr(String, boolean)}, but assumes {@code str} is |
| * not HTML / HTML-escaped. |
| */ |
| public boolean startsWithLtr(String str) { |
| return FIRST_STRONG_IS_LTR_RE.test(str); |
| } |
| |
| /** |
| * Check whether the first strongly-directional character in the string is |
| * LTR. |
| * @param str the string to check |
| * @param isHtml whether str is HTML / HTML-escaped |
| * @return whether LTR exit directionality was detected |
| */ |
| public boolean startsWithLtr(String str, boolean isHtml) { |
| return startsWithLtr(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Like {@link #startsWithRtl(String, boolean)}, but assumes {@code str} is |
| * not HTML / HTML-escaped. |
| */ |
| public boolean startsWithRtl(String str) { |
| return FIRST_STRONG_IS_RTL_RE.test(str); |
| } |
| |
| /** |
| * Check whether the first strongly-directional character in the string is |
| * RTL. |
| * @param str the string to check |
| * @param isHtml whether {@code str} is HTML / HTML-escaped |
| * @return whether RTL exit directionality was detected |
| */ |
| public boolean startsWithRtl(String str, boolean isHtml) { |
| return startsWithRtl(stripHtmlIfNeeded(str, isHtml)); |
| } |
| |
| /** |
| * Returns the input text with spaces instead of HTML tags or HTML escapes, if |
| * isStripNeeded is true. Else returns the input as is. |
| * Useful for text directionality estimation. |
| * Note: the function should not be used in other contexts; it is not 100% |
| * correct, but rather a good-enough implementation for directionality |
| * estimation purposes. |
| */ |
| String stripHtmlIfNeeded(String str, boolean isStripNeeded) { |
| return isStripNeeded ? SKIP_HTML_RE.replace(str, " ") : str; |
| } |
| } |