Integrate new, faster, Javascript function clustering
http://gwt-code-reviews.appspot.com/669801
Review by: spoon@google.com
git-svn-id: https://google-web-toolkit.googlecode.com/svn/trunk@8890 8db76d5a-ed1c-0410-87a9-c151d255dfc7
diff --git a/dev/core/src/com/google/gwt/dev/util/editdistance/CharIndex.java b/dev/core/src/com/google/gwt/dev/util/editdistance/CharIndex.java
new file mode 100644
index 0000000..097ccb5
--- /dev/null
+++ b/dev/core/src/com/google/gwt/dev/util/editdistance/CharIndex.java
@@ -0,0 +1,371 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+/**
+ * A performance-oriented character-indexed map.
+ * Provides a mapping from characters in a limited alphabet
+ * (defined by a CharSequence) to integer values (for array indexing).
+ * Emphasis is on performance (e.g., compared to using a generic HashMap),
+ * in terms of speed of (post-construction) lookup and space required.
+ *
+ * Maps are constructed using a static <tt>getInstance</tt> method that
+ * chooses an efficient implementation for a given string.
+ * The map interface consists of:
+ * a <tt>size</tt> method for determining the maximum range of indices used;
+ * a <tt>lookup</tt> method for translating a character to its index; and,
+ * a <tt>nullElement</tt> method that returns the index that is not
+ * returned by <tt>lookup</tt> for any characer actually in the map.
+ *
+ * The target application is the mapping of natural-language strings.
+ * Most strings include characters from a single language, and thus
+ * fall into a small slice of the Unicode space. In particular,
+ * many strings fall into just the Latin series.
+ */
+public abstract class CharIndex {
+ /**
+ * An index based on a garden-variety hash map.
+ * This is the implementation of last recourse (from a
+ * performance perspective).
+ *
+ * TODO(mwyoung): there may be value in an implementation
+ * that falls in between the masked and fullhash variants;
+ * for example, one using a closed hash table with rehashing.
+ */
+ public static class FullHash extends CharIndex {
+ /**
+ * Mutable holder for a character
+ */
+ static class Char {
+ char c;
+ @Override
+ public boolean equals(Object x) {
+ return (x != null) && (((Char)x).c == this.c);
+ }
+ @Override
+ public int hashCode() {
+ return c;
+ }
+ @Override
+ public String toString() {
+ return "'" + c + "'";
+ }
+ }
+
+ static final int NULL_ELEMENT = 0;
+ protected int lastUsed = NULL_ELEMENT;
+ /**
+ * Mapping from pattern characters to their integer index values
+ */
+ final java.util.HashMap<Char,Integer> map;
+
+ /**
+ * Constructs a full hash-based mapping
+ */
+ FullHash(CharSequence s) {
+ /* Choose a hash size larger at least twice the string length */
+ int len = s.length();
+ int power = Integer.highestOneBit(len);
+
+ /* Create the map */
+ map = new java.util.HashMap<Char,Integer>(
+ power << ((power == len) ? 1 : 2));
+
+ /*
+ * Add characters one at a time to the map
+ */
+
+ Char test = new Char(); /* (re)used for lookup */
+ for (int i = 0; i < s.length(); i++) {
+ test.c = s.charAt(i);
+ if (map.get(test) == null) {
+ /* Not there... add it */
+ map.put(test, new Integer(++lastUsed));
+
+ /* Grow a new holder for subsequent lookups */
+ test = new Char();
+ }
+ }
+ }
+
+ @Override
+ public int lookup(char c) {
+ final Char lookupTest = new Char();
+ lookupTest.c = c;
+ Integer result = map.get(lookupTest);
+ return (result != null) ? result.intValue() : 0;
+ }
+
+ @Override
+ public int[] map(CharSequence s, int[] mapped) {
+ /* Create one mutable Char, and reuse it for all lookups */
+ final Char lookupTest = new Char();
+
+ int len = s.length();
+ if (mapped.length < len) {
+ mapped = new int[len];
+ }
+
+ for (int i = 0; i < len; i++) {
+ lookupTest.c = s.charAt(i);
+ Integer result = map.get(lookupTest);
+ mapped[i] = (result != null) ? result.intValue() : NULL_ELEMENT;
+ }
+ return mapped;
+ }
+
+ @Override
+ public int nullElement() {
+ return NULL_ELEMENT;
+ }
+
+ @Override
+ public int size() {
+ return lastUsed + 1;
+ }
+ }
+
+ /**
+ * An index based on a simple mask: index(c) = (c & MASK).
+ * This allows for a very fast index function for many
+ * languages outside of the ASCII spectrum. Most languages
+ * fit in a single 8-bit Unicode slice. Even languages
+ * that cross slices for special characters (such as extended
+ * Latin) often have no collisions under simple masking.
+ */
+ public static class Masked extends CharIndex {
+ /**
+ * Hash table size
+ */
+ static final int SIZE = 0x100;
+
+ /**
+ * Mask used for hashing
+ */
+ static final int MASK = (SIZE - 1);
+
+ /**
+ * Where we may invalid characters: beyond the hash table
+ */
+ static final int NULL_ELEMENT = SIZE;
+
+
+ /**
+ * Generates an instance of this implementation if possible.
+ * @param s pattern string
+ * @return an instance of this CharIndex if the pattern satisfies
+ * the constraints for this implementation; otherwise, null
+ */
+ static Masked generate(CharSequence s) {
+ /*
+ * This implementation requires that there be no hash collisions
+ * among the pattern characters, using a simple mask-based hash.
+ */
+ char [] contains = new char[SIZE];
+
+ /* Ensure that for all x, hash(contains[x]) != x initially. */
+ contains[0] = (char)1;
+
+ /* Hash characters, recording values seen, rejecting collisions */
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ int index = c & MASK;
+ if (contains[index] != c) {
+ if ((contains[index] & MASK) == index) {
+ return null;
+ }
+ contains[index] = c;
+ }
+ }
+ return new Masked(contains);
+ }
+
+ /**
+ * Closed hash table: characters actually indexed.
+ * Unused cells contain values inappropriate to their index.
+ */
+ final char[] contains;
+
+ /**
+ * Constructor based on hash table built by generate()
+ */
+ private Masked(char [] contains) {
+ this.contains = contains;
+ }
+
+ @Override
+ public int lookup(char c) {
+ int index = c & MASK;
+ return (c == contains[index]) ? index : NULL_ELEMENT;
+ }
+
+ @Override
+ public int[] map(CharSequence s, int[] mapped) {
+ int len = s.length();
+ if (mapped.length < len) mapped = new int[len];
+ for (int i = 0; i < len; i++) {
+ char c = s.charAt(i);
+ int index = c & MASK;
+ mapped[i] = (c == contains[index]) ? index : NULL_ELEMENT;
+ }
+ return mapped;
+ }
+
+ @Override
+ public int nullElement() { return NULL_ELEMENT; }
+
+ @Override
+ public int size() { return NULL_ELEMENT + 1; }
+ }
+
+ /**
+ * An index based on the identity mapping: index(c) = c.
+ * This requires minimal computation and no additional space
+ * for the basic ASCII character set.
+ */
+ public static class Straight extends CharIndex {
+ /**
+ * The largest character we will map (directly)
+ */
+ static final int MAX = 0x80;
+
+ /**
+ * A mask used to find characters that fall outside
+ */
+ static final int MASK = ~(MAX - 1);
+
+ /**
+ * A map result we never generate for valid characters
+ */
+ static final int NULL_ELEMENT = MAX;
+
+
+ /**
+ * Generates an instance of this implementation if possible.
+ * @param s pattern string
+ * @return an instance of this CharIndex if the pattern satisfies
+ * the constraints for this implementation; otherwise, null
+ */
+ static Straight generate(CharSequence s) {
+ /* This implementation requires that all characters fall below MAX */
+ for (int i = 0; i < s.length(); i++) {
+ if ((s.charAt(i) & MASK) != 0) {
+ return null;
+ }
+ }
+
+ return new Straight();
+ }
+
+ /**
+ * Simple private constructor, no state required
+ */
+ private Straight() { }
+
+ @Override
+ public int lookup(char c) {
+ return ((c & MASK) == 0) ? c : NULL_ELEMENT;
+ }
+
+ @Override
+ public int[] map(CharSequence s, int[] mapped) {
+ int len = s.length();
+ if (mapped.length < len) mapped = new int[len];
+ for (int i = 0; i < len; i++) {
+ char c = s.charAt(i);
+ mapped[i] = ((c & MASK) == 0) ? c : NULL_ELEMENT;
+ }
+ return mapped;
+ }
+
+ @Override
+ public int nullElement() {
+ return NULL_ELEMENT;
+ }
+
+ @Override
+ public int size() {
+ return NULL_ELEMENT + 1;
+ }
+ }
+
+ /**
+ * Generates an index for a given alphabet, defined by a character sequence.
+ * An appropriate implementation is chosen for the alphabet. The
+ * character sequence is allowed to contain multiple instances of
+ * the same character.
+ *
+ * @param s the alphabet to be indexed
+ * @return an index appropriate to this alphabet
+ */
+ public static CharIndex getInstance(CharSequence s) {
+ CharIndex result;
+
+ /* Try fastest mappings first, then more general ones */
+
+ if ((result = Straight.generate(s)) != null) {
+ return result;
+ }
+
+ if ((result = Masked.generate(s)) != null) {
+ return result;
+ }
+
+ /* Full hash always works */
+ return new FullHash(s);
+ }
+
+ /**
+ * Returns the index (mapping result) for a given character.
+ * If the character is present in the alphabet, then a unique index
+ * is returned; other characters may be mapped to unique
+ * values, *or* may be mapped to a shared <tt>nullElement</tt> value.
+ * @param c character for which an index is sought
+ * @return an integer index for the character
+ */
+ public abstract int lookup(char c);
+
+ /**
+ * Performs lookups on an entire sequence of characters.
+ * This allows an entire string to be mapped with fewer
+ * method invocations, and possibly with some added internal
+ * efficiencies.
+ *
+ * @param s character sequence to be indexed
+ * @param mapped array for storing results
+ * @return an array of indices for the character sequence,
+ * either the <tt>mapped</tt> parameter or a larger newly-allocated
+ * array if required
+ */
+ public abstract int[] map(CharSequence s, int[] mapped);
+
+ /**
+ * Returns an index that is not used for any character in
+ * the alphabet. This index *may* be returned for characters
+ * not in the alphabet.
+ * @return an index not used for a valid character
+ */
+ public abstract int nullElement();
+
+ /**
+ * Returns the maximum index result that can be returned by
+ * the <tt>lookup</tt> function (including the <tt>nullElement</tt> result
+ * if applicable). No negative values are ever returned
+ * from <tt>lookup</tt>.
+ * @return maximum index returned from any <tt>lookup</tt>
+ */
+ public abstract int size();
+}
diff --git a/dev/core/src/com/google/gwt/dev/util/editdistance/GeneralEditDistance.java b/dev/core/src/com/google/gwt/dev/util/editdistance/GeneralEditDistance.java
new file mode 100644
index 0000000..db3f45a
--- /dev/null
+++ b/dev/core/src/com/google/gwt/dev/util/editdistance/GeneralEditDistance.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+/**
+ * An engine definition for computing string edit distances, and
+ * implementation generators.
+ *
+ * Generally, edit distance is the minimum number of edit operations required
+ * to transform one string into another. Being a minimum transform, it
+ * forms a metric space (one that is non-negative, symmetric, and obeys
+ * triangle inequality).
+ *
+ * The most common form is Levenshtein distance, where the set of operations
+ * consists of:
+ * deleting one character;
+ * inserting one character; or,
+ * substituting one character for another,
+ * each having a "cost" of 1.
+ *
+ * Another common form is Damerau-Levenshtein distance, which adds
+ * transpose two adjacent characters
+ * to the set of operations. Note that many implementations of
+ * Damerau distance do not account for insertion after transposition;
+ * this "restricted" Damerau distance does not obey triangle inequality,
+ * but is substantially easier to compute than the unrestricted metric.
+ *
+ * Another variation is to apply differential costs for the various
+ * operations. For example, substitutions could be weighted by
+ * character type (e.g., vowel<=>vowel being a cheaper edit than
+ * vowel<=>consonant).
+ *
+ * This class defines an engine for computing edit distance from
+ * a fixed ("pattern") string to other ("target") strings. The pattern
+ * is fixed at instantiation time; this allows an implementation to
+ * perform precomputations to make individual target computations faster.
+ *
+ * To allow further performance optimization, the GeneralEditDistance
+ * contract does not require thread-safety. A <tt>duplicate</tt> method is
+ * defined to allow additional equivalent instances to be created
+ * for separate threads; it is essentially a type-knowledgeable <tt>clone</tt>.
+ */
+public interface GeneralEditDistance {
+ /**
+ * Creates a duplicate (clone) of this distance engine, typically
+ * in order to use it in another thread (as the GeneralEditDistance
+ * contract does not specify thread safety).
+ *
+ * This method differs from Object.clone() in two ways:
+ * it is typed (to GeneralEditDistance); and, it must not throw
+ * an exception.
+ *
+ * @return a clone of this instance.
+ */
+ GeneralEditDistance duplicate();
+
+ /**
+ * Computes edit distance from the pattern string (used to
+ * construct this GeneralEditDistance instance) to a given target
+ * string, bounded by a limit of interest.
+ *
+ * The specified limit must be non-negative. If the actual distance
+ * exceeds the limit, this method may return *any* value above the limit;
+ * it need not be the actual distance. The behavior of
+ * illegal limit values is undefined; an implementation may
+ * return an incorrect result or raise a runtime exception.
+ *
+ * @param target string for which distance is to be compared
+ * against a predefined pattern
+ * @param limit maximum distance for which exact result is
+ * required (and beyond which any over-limit result
+ * can be returned)
+ * @return edit distance from the predefined pattern to the
+ * target (or any value above limit, if the actual distance
+ * exceeds the prescribed limit)
+ */
+ int getDistance(CharSequence target, int limit);
+}
diff --git a/dev/core/src/com/google/gwt/dev/util/editdistance/GeneralEditDistances.java b/dev/core/src/com/google/gwt/dev/util/editdistance/GeneralEditDistances.java
new file mode 100644
index 0000000..32ebde2
--- /dev/null
+++ b/dev/core/src/com/google/gwt/dev/util/editdistance/GeneralEditDistances.java
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+/**
+ * A collection of instance generators for the GeneralEditDistance interface.
+ */
+public class GeneralEditDistances {
+ /**
+ * Chooses the best implementation of Levenshtein string edit distance
+ * available at the current time.
+ */
+ /*
+ * As of 2007-08-23, the best algorithm known (to the author=mwyoung) for
+ * short strings is one due to Eugene Myers, except for the special case
+ * where the distance limit is 0 or 1. The Myers algorithm also has good
+ * worst-case performance for long strings when the edit distance is not
+ * reasonably bounded.
+ *
+ * When there is a good bound, a variant of the Ukkonen algorithm due to
+ * Berghel and Roach (modified by Michael Young to use linear space)
+ * is faster for long strings.
+ *
+ * Note that other algorithms that perform better in some cases for running
+ * text searches do not outperform Myers for rigid distance computations.
+ * Notably:
+ * Navarro/Baeza-Yates (Algorithmica 23,2) simulates an NFA with an
+ * epsilon-cycle on the initial state (appropriate for running texts)
+ * and reports success without computing exact distance. When adjusted
+ * to a fixed starting point and computing distance, its state machine
+ * is larger and it underperforms.
+ *
+ * BITAP (Baeza-Yates/Gonnet, Manber/Wu) also simulates an NFA, and
+ * Navarro claims that it wins for small patterns and small limits for
+ * running search. Experiments with a Java implementation showed that
+ * it beat Myers on pure string edit distance only for limits where the
+ * special 0-1 limit applied, where special-case comparison beats all.
+ *
+ * A survey of algorithms for running text search by Navarro appeared
+ * in ACM Computing Surveys 33#1: http://portal.acm.org/citation.cfm?id=375365
+ * Another algorithm (Four Russians) that Navarro claims superior for very
+ * long patterns and high limits was not evaluated for inclusion here.
+ * Filtering algorithms also improve running search, but do not help
+ * for pure edit distance.
+ */
+ private static class Levenshtein implements GeneralEditDistance {
+ /**
+ * Long+bounded implementation class: distance-only Berghel-Roach
+ */
+ private ModifiedBerghelRoachEditDistance berghel;
+
+ /**
+ * Short/unbounded implementation class: Myers bit-parallel
+ */
+ private MyersBitParallelEditDistance myers;
+
+ /**
+ * Saved pattern, for specialized comparisons
+ */
+ private final CharSequence pattern;
+
+ /**
+ * Length of saved pattern
+ */
+ private final int patternLength;
+
+ private Levenshtein(CharSequence pattern) {
+ this.pattern = pattern;
+ this.patternLength = pattern.length();
+ }
+
+ public GeneralEditDistance duplicate() {
+ Levenshtein dup = new Levenshtein(pattern);
+
+ /* Duplicate the Myers engine, as it is cheaper than rebuilding */
+ if (this.myers != null) {
+ dup.myers = (MyersBitParallelEditDistance) this.myers.duplicate();
+ }
+
+ /* Do not duplicate the Berghel engine; it provides no savings. */
+
+ return dup;
+ }
+
+ public int getDistance(CharSequence target, int limit) {
+ /* When the limit is 0 or 1, specialized comparisons are much faster. */
+ if (limit <= 1) {
+ return limit == 0 ?
+ (pattern.equals(target) ? 0 : 1) :
+ atMostOneError(pattern, target);
+ }
+
+ /*
+ * The best algorithm for long strings depends on the resulting
+ * edit distance (or the limit placed on it). Without further
+ * information on the likelihood of a low distance, we guess
+ * based on the provided limit. We currently lean toward using
+ * the Myers algorithm unless we are pretty sure that the
+ * Berghel-Roach algorithm will win (based on the limit).
+ *
+ * Note that when the string lengths are small (fewer characters
+ * than bits in a long), Myers wins regardless of limit.
+ */
+ if ((patternLength > 64)
+ && (limit < (target.length() / 10))) {
+ if (berghel == null) {
+ berghel = ModifiedBerghelRoachEditDistance.getInstance(pattern);
+ }
+ return berghel.getDistance(target, limit);
+ }
+
+ if (myers == null) {
+ myers = MyersBitParallelEditDistance.getInstance(pattern);
+ }
+
+ return myers.getDistance(target, limit);
+ }
+ }
+
+ /**
+ * Compares two strings for at most one insert/delete/substitute difference.
+ * Since operations cannot be composed, a simple case analysis is possible.
+ *
+ * @param s1 one string to be compared
+ * @param s2 the other string to be compared
+ * @return Levenshtein edit distance if no greater than 1;
+ * otherwise, more than 1
+ */
+ public static int atMostOneError(CharSequence s1, CharSequence s2) {
+ int s1Length = s1.length();
+ int s2Length = s2.length();
+ int errors = 0; /* running count of edits required */
+
+ switch(s2Length - s1Length) {
+ /*
+ * Strings are the same length. No single insert/delete is possible;
+ * at most one substitution can be present.
+ */
+ case 0:
+ for (int i = 0; i < s2Length; i++) {
+ if ((s2.charAt(i) != s1.charAt(i)) && (errors++ != 0)) {
+ break;
+ }
+ }
+ return errors;
+
+ /*
+ * Strings differ in length by 1, so we have an insertion
+ * (and therefore cannot have any other substitutions).
+ */
+ case 1: /* s2Length > s1Length */
+ for (int i = 0; i < s1Length; i++) {
+ if (s2.charAt(i) != s1.charAt(i)) {
+ for (; i < s1Length; i++) {
+ if (s2.charAt(i + 1) != s1.charAt(i)) {
+ return 2;
+ }
+ }
+ return 1;
+ }
+ }
+ return 1;
+
+ /* Same as above case, with strings reversed */
+ case -1: /* s1Length > s2Length */
+ for (int i = 0; i < s2Length; i++) {
+ if (s2.charAt(i) != s1.charAt(i)) {
+ for (; i < s2Length; i++) {
+ if (s2.charAt(i) != s1.charAt(i + 1)) {
+ return 2;
+ }
+ }
+ return 1;
+ }
+ }
+ return 1;
+
+ /* Edit distance is at least difference in lengths; more than 1 here. */
+ default:
+ return 2;
+ }
+ }
+
+ /**
+ * Generates an GeneralEditDistance engine for a particular pattern string
+ * based on Levenshtein distance. Caller must ensure that the
+ * pattern does not change (consider using pattern.toString() if
+ * necessary) as long as the generated object is to be used.
+ *
+ * @param pattern a string from which distance computations are desired
+ * @return an engine for computing Levenshtein distances from that pattern
+ */
+ public static GeneralEditDistance
+ getLevenshteinDistance(CharSequence pattern) {
+ return new Levenshtein(pattern);
+ }
+
+ private GeneralEditDistances() { }
+}
diff --git a/dev/core/src/com/google/gwt/dev/util/editdistance/ModifiedBerghelRoachEditDistance.java b/dev/core/src/com/google/gwt/dev/util/editdistance/ModifiedBerghelRoachEditDistance.java
new file mode 100644
index 0000000..de61af5
--- /dev/null
+++ b/dev/core/src/com/google/gwt/dev/util/editdistance/ModifiedBerghelRoachEditDistance.java
@@ -0,0 +1,480 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+/**
+ * A modified version of a string edit distance described by Berghel and
+ * Roach that uses only O(d) space and O(n*d) worst-case time, where n is
+ * the pattern string length and d is the edit distance computed.
+ * We achieve the space reduction by keeping only those sub-computations
+ * required to compute edit distance, giving up the ability to
+ * reconstruct the edit path.
+ */
+public class ModifiedBerghelRoachEditDistance implements GeneralEditDistance {
+ /*
+ * This is a modification of the original Berghel-Roach edit
+ * distance (based on prior work by Ukkonen) described in
+ * ACM Transactions on Information Systems, Vol. 14, No. 1,
+ * January 1996, pages 94-106.
+ *
+ * I observed that only O(d) prior computations are required
+ * to compute edit distance. Rather than keeping all prior
+ * f(k,p) results in a matrix, we keep only the two "outer edges"
+ * in the triangular computation pattern that will be used in
+ * subsequent rounds. We cannot reconstruct the edit path,
+ * but many applications do not require that; for them, this
+ * modification uses less space (and empirically, slightly
+ * less time).
+ *
+ * First, some history behind the algorithm necessary to understand
+ * Berghel-Roach and our modification...
+ *
+ * The traditional algorithm for edit distance uses dynamic programming,
+ * building a matrix of distances for substrings:
+ * D[i,j] holds the distance for string1[0..i]=>string2[0..j].
+ * The matrix is initially populated with the trivial values
+ * D[0,j]=j and D[i,0]=i; and then expanded with the rule:
+ * <pre>
+ * D[i,j] = min( D[i-1,j]+1, // insertion
+ * D[i,j-1]+1, // deletion
+ * (D[i-1,j-1]
+ * + (string1[i]==string2[j])
+ * ? 0 // match
+ * : 1 // substitution ) )
+ * </pre>
+ *
+ * Ukknonen observed that each diagonal of the matrix must increase
+ * by either 0 or 1 from row to row. If D[i,j] = p, then the
+ * then the matching rule requires that D[i+x,j+x] = p for all
+ * x where string1[i..i+x) matches string2[j..j+j+x). Ukkonen
+ * defined a function f(k,p) as the highest row number in which p
+ * appears on the k-th diagonal (those D[i,j] where k=(i-j), noting
+ * that k may be negative). The final result of the edit
+ * distance is the D[n,m] cell, on the (n-m) diagonal; it is
+ * the value of p for which f(n-m, p) = m. The function f can
+ * also be computed dynamically, according to a simple recursion:
+ * <pre>
+ * f(k,p) {
+ * contains_p = max(f(k-1,p-1), f(k,p-1)+1, f(k+1,p-1)+1)
+ * while (string1[contains_p] == string2[contains_p + k])
+ * contains_p++;
+ * return contains_p;
+ * }
+ * </pre>
+ * The max() expression finds a row where the k-th diagonal must
+ * contain p by virtue of an edit from the prior, same, or following
+ * diagonal (corresponding to an insert, substitute, or delete);
+ * we need not consider more distant diagonals because row-to-row
+ * and column-to-column changes are at most +/- 1.
+ *
+ * The original Ukkonen algorithm computed f(k,p) roughly as
+ * follows:
+ * <pre>
+ * for (p = 0; ; p++) {
+ * compute f(k,p) for all valid k
+ * if (f(n-m, p) == m) return p;
+ * }
+ * </pre>
+ *
+ * Berghel and Roach observed that many values of f(k,p) are
+ * computed unnecessarily, and reorganized the computation into
+ * a just-in-time sequence. In each iteration, we are primarily
+ * interested in the terminating value f(main,p), where main=(n-m)
+ * is the main diagonal. To compute that we need f(x,p-1) for
+ * three values of x: main-1, main, and main+1. Those depend on
+ * values for p-2, and so forth. We will already have computed
+ * f(main,p-1) in the prior round, and thus f(main-1,p-2) and
+ * f(main+1,p-2), and so forth. The only new values we need to compute
+ * are on the edges: f(main-i,p-i) and f(main+i,p-i). Noting that
+ * f(k,p) is only meaningful when abs(k) is no greater than p,
+ * one of the Berghel-Roach reviewers noted that we can compute
+ * the bounds for i:
+ * <pre>
+ * (main+i &le p-i) implies (i ≤ (p-main)/2)
+ * </pre>
+ * (where main+i is limited on the positive side) and similarly
+ * <pre>
+ * (-(main-i) &le p-i) implies (i ≤ (p+main)/2).
+ * </pre>
+ * (where main-i is limited on the negative side).
+ *
+ * This reduces the computation sequence to
+ * <pre>
+ * for (i = (p-main)/2; i > 0; i--) compute f(main+i,p-i);
+ * for (i = (p+main)/2; i > 0; i--) compute f(main-i,p-i);
+ * if (f(main, p) == m) return p;
+ * </pre>
+ *
+ * The original Berghel-Roach algorithm recorded prior values
+ * of f(k,p) in a matrix, using O(distance^2) space, enabling
+ * reconstruction of the edit path, but if all we want is the
+ * edit *distance*, we only need to keep O(distance) prior computations.
+ *
+ * The requisite prior k-1, k, and k+1 values are conveniently
+ * computed in the current round and the two preceding it.
+ * For example, on the higher-diagonal side, we compute:
+ * <pre>
+ * current[i] = f(main+i, p-i)
+ * </pre>
+ * We keep the two prior rounds of results, where p was one and two
+ * smaller. So, from the preceidng round
+ * <pre>
+ * last[i] = f(main+i, (p-1)-i)
+ * </pre>
+ * and from the prior round, but one position back:
+ * <pre>
+ * prior[i-1] = f(main+(i-1), (p-2)-(i-1))
+ * </pre>
+ * In the current round, one iteration earlier:
+ * <pre>
+ * current[i+1] = f(main+(i+1), p-(i+1))
+ * </pre>
+ * Note that the distance in all of these evaluates to p-i-1,
+ * and the diagonals are (main+i) and its neighbors... just
+ * what we need. The lower-diagonal side behaves similarly.
+ *
+ * We need to materialize values that are not computed in prior
+ * rounds, for either of two reasons: <ul>
+ * <li> Initially, we have no prior rounds, so we need to fill
+ * all of the "last" and "prior" values for use in the
+ * first round. The first round uses only on one side
+ * of the main diagonal or the other.
+ * <li> In every other round, we compute one more diagonal than before.
+ * </ul>
+ * In all of these cases, the missing f(k,p) values are for abs(k) > p,
+ * where a real value of f(k,p) is undefined. [The original Berghel-Roach
+ * algorithm prefills its F matrix with these values, but we fill
+ * them as we go, as needed.] We define
+ * <pre>
+ * f(-p-1,p) = p, so that we start diagonal -p with row p,
+ * f(p+1,p) = -1, so that we start diagonal p with row 0.
+ * </pre>
+ * (We also allow f(p+2,p)=f(-p-2,p)=-1, causing those values to
+ * have no effect in the starting row computation.]
+ *
+ * We only expand the set of diagonals visited every other round,
+ * when (p-main) or (p+main) is even. We keep track of even/oddness
+ * to save some arithmetic. The first round is always even, as p=abs(main).
+ * Note that we rename the "f" function to "computeRow" to be Googley.
+ */
+
+ private static final int [] EMPTY_INT_ARRAY = new int[0];
+
+ /**
+ * Creates an instance for computing edit distance from {@code pattern}.
+ * @param pattern string from which distances are measured
+ * @return an instance for computing distances from the pattern
+ */
+ public static ModifiedBerghelRoachEditDistance
+ getInstance(CharSequence pattern) {
+ return getInstance(pattern.toString());
+ }
+
+ /**
+ * Creates an instance for computing edit distance from {@code pattern}.
+ * @param pattern string from which distances are measured
+ * @return an instance for computing distances from the pattern
+ */
+ public static ModifiedBerghelRoachEditDistance
+ getInstance(String pattern) {
+ return new ModifiedBerghelRoachEditDistance(pattern.toCharArray());
+ }
+
+ /*
+ * The current and two preceding sets of Ukkonen f(k,p) values for diagonals
+ * around the main, computed by the main loop of {@code getDistance}. These
+ * arrays are retained between calls to save allocation costs. (They are all
+ * initialized to a real array so that we can indiscriminately use length
+ * when ensuring/resizing.)
+ */
+ private int[] currentLeft = EMPTY_INT_ARRAY;
+
+ private int[] currentRight = EMPTY_INT_ARRAY;
+
+ private int[] lastLeft = EMPTY_INT_ARRAY;
+
+ private int[] lastRight = EMPTY_INT_ARRAY;
+
+ /**
+ * The "pattern" string against which others are compared
+ */
+ private final char[] pattern;
+
+ private int[] priorLeft = EMPTY_INT_ARRAY;
+
+ private int[] priorRight = EMPTY_INT_ARRAY;
+
+ private ModifiedBerghelRoachEditDistance(char[] pattern) {
+ this.pattern = pattern;
+ }
+
+ public ModifiedBerghelRoachEditDistance duplicate() {
+ return new ModifiedBerghelRoachEditDistance(pattern);
+ }
+
+ public int getDistance(CharSequence targetSequence, int limit) {
+ final int targetLength = targetSequence.length();
+
+ /*
+ * Compute the main diagonal number.
+ * The final result lies on this diagonal.
+ */
+ final int main = pattern.length - targetLength;
+
+ /*
+ * Compute our initial distance candidate.
+ * The result cannot be less than the difference in
+ * string lengths, so we start there.
+ */
+ int distance = Math.abs(main);
+ if (distance > limit) {
+ /* More than we wanted. Give up right away */
+ return Integer.MAX_VALUE;
+ }
+
+ /* Snag a copy of the second string for efficiency */
+ final char[] target = new char[targetLength];
+ for (int i = 0; i < targetLength; i++) {
+ target[i] = targetSequence.charAt(i);
+ }
+
+ /*
+ * In the main loop below, the current{Right,Left} arrays record results
+ * from the current outer loop pass. The last{Right,Left} and
+ * prior{Right,Left} arrays hold the results from the preceding two passes.
+ * At the end of the outer loop, we shift them around (reusing the prior
+ * array as the current for the next round, to avoid reallocating).
+ * The Right reflects higher-numbered diagonals, Left lower-numbered.
+ */
+
+ /*
+ * Fill in "prior" values for the first two passes through
+ * the distance loop. Note that we will execute only one side of
+ * the main diagonal in these passes, so we only need
+ * initialize one side of prior values.
+ */
+
+ if (main <= 0) {
+ ensureCapacityRight(distance, false);
+ for (int j = 0; j <= distance; j++) {
+ lastRight[j] = distance - j - 1; /* Make diagonal -k start in row k */
+ priorRight[j] = -1;
+ }
+ } else {
+ ensureCapacityLeft(distance, false);
+ for (int j = 0; j <= distance; j++) {
+ lastLeft[j] = -1; /* Make diagonal +k start in row 0 */
+ priorLeft[j] = -1;
+ }
+ }
+
+ /*
+ * Keep track of even rounds. Only those rounds consider new diagonals,
+ * and thus only they require artificial "last" values below.
+ */
+ boolean even = true;
+
+ /*
+ * MAIN LOOP: try each successive possible distance until one succeeds.
+ */
+ while (true) {
+ /*
+ * Before calling computeRow(main, distance), we need to fill in
+ * missing cache elements. See the high-level description above.
+ */
+
+ /*
+ * Higher-numbered diagonals
+ */
+
+ int offDiagonal = (distance - main) / 2;
+ ensureCapacityRight(offDiagonal, true);
+
+ if (even) {
+ /* Higher diagonals start at row 0 */
+ lastRight[offDiagonal] = -1;
+ }
+
+ int immediateRight = -1;
+ for (; offDiagonal > 0; offDiagonal--) {
+ currentRight[offDiagonal] = immediateRight = computeRow(
+ (main + offDiagonal),
+ (distance - offDiagonal),
+ pattern,
+ target,
+ priorRight[offDiagonal - 1],
+ lastRight[offDiagonal],
+ immediateRight);
+ }
+
+
+ /*
+ * Lower-numbered diagonals
+ */
+
+ offDiagonal = (distance + main) / 2;
+ ensureCapacityLeft(offDiagonal, true);
+
+ if (even) {
+ /* Lower diagonals, fictitious values for f(-x-1,x) = x */
+ lastLeft[offDiagonal] = (distance - main) / 2 - 1;
+ }
+
+ int immediateLeft = even ? -1 : (distance - main) / 2;
+
+ for (; offDiagonal > 0; offDiagonal--) {
+ currentLeft[offDiagonal] = immediateLeft = computeRow(
+ (main - offDiagonal),
+ (distance - offDiagonal),
+ pattern, target,
+ immediateLeft,
+ lastLeft[offDiagonal],
+ priorLeft[offDiagonal - 1]);
+ }
+
+ /*
+ * We are done if the main diagonal has distance in the last row.
+ */
+ int mainRow = computeRow(main, distance, pattern, target,
+ immediateLeft, lastLeft[0], immediateRight);
+
+ if ((mainRow == targetLength) || (++distance > limit) || (distance < 0)) {
+ break;
+ }
+
+ /* The [0] element goes to both sides. */
+ currentLeft[0] = currentRight[0] = mainRow;
+
+ /* Rotate rows around for next round: current=>last=>prior (=>current) */
+ int[] tmp = priorLeft;
+ priorLeft = lastLeft;
+ lastLeft = currentLeft;
+ currentLeft = priorLeft;
+
+ tmp = priorRight;
+ priorRight = lastRight;
+ lastRight = currentRight;
+ currentRight = tmp;
+
+ /* Update evenness, too */
+ even = !even;
+ }
+
+ return distance;
+ }
+
+ /**
+ * Computes the highest row in which the distance {@code p} appears
+ * in diagonal {@code k} of the edit distance computation for
+ * strings {@code a} and {@code b}. The diagonal number is
+ * represented by the difference in the indices for the two strings;
+ * it can range from {@code -b.length()} through {@code a.length()}.
+ *
+ * More precisely, this computes the highest value x such that
+ * <pre>
+ * p = edit-distance(a[0:(x+k)), b[0:x)).
+ * </pre>
+ *
+ * This is the "f" function described by Ukkonen.
+ *
+ * The caller must assure that abs(k) ≤ p, the only values for
+ * which this is well-defined.
+ *
+ * The implementation depends on the cached results of prior
+ * computeRow calls for diagonals k-1, k, and k+1 for distance p-1.
+ * These must be supplied in {@code knownLeft}, {@code knownAbove},
+ * and {@code knownRight}, respectively.
+ * @param k diagonal number
+ * @param p edit distance
+ * @param a one string to be compared
+ * @param b other string to be compared
+ * @param knownLeft value of {@code computeRow(k-1, p-1, ...)}
+ * @param knownAbove value of {@code computeRow(k, p-1, ...)}
+ * @param knownRight value of {@code computeRow(k+1, p-1, ...)}
+ */
+ private int computeRow(int k, int p, char[] a, char[] b,
+ int knownLeft, int knownAbove, int knownRight) {
+ assert (Math.abs(k) <= p);
+ assert (p >= 0);
+
+ /*
+ * Compute our starting point using the recurrance.
+ * That is, find the first row where the desired edit distance
+ * appears in our diagonal. This is at least one past
+ * the highest row for
+ */
+ int t;
+ if (p == 0) {
+ t = 0;
+ } else {
+ /*
+ * We look at the adjacent diagonals for the next lower edit distance.
+ * We can start in the next row after the prior result from
+ * our own diagonal (the "substitute" case), or the next diagonal
+ * ("delete"), but only the same row as the prior result from
+ * the prior diagonal ("insert").
+ */
+ t = Math.max(Math.max(knownAbove, knownRight) + 1, knownLeft);
+ }
+
+ /*
+ * Look down our diagonal for matches to find the maximum
+ * row with edit-distance p.
+ */
+ int tmax = Math.min(b.length, (a.length - k));
+
+ while ((t < tmax) && b[t] == a[t + k]) {
+ t++;
+ }
+
+ return t;
+ }
+
+ /*
+ * Ensures that the Left arrays can be indexed through {@code index},
+ * inclusively, resizing (and copying) as necessary.
+ */
+ private void ensureCapacityLeft(int index, boolean copy) {
+ if (currentLeft.length <= index) {
+ index++;
+ priorLeft = resize(priorLeft, index, copy);
+ lastLeft = resize(lastLeft, index, copy);
+ currentLeft = resize(currentLeft, index, false);
+ }
+ }
+
+ /*
+ * Ensures that the Right arrays can be indexed through {@code index},
+ * inclusively, resizing (and copying) as necessary.
+ */
+ private void ensureCapacityRight(int index, boolean copy) {
+ if (currentRight.length <= index) {
+ index++;
+ priorRight = resize(priorRight, index, copy);
+ lastRight = resize(lastRight, index, copy);
+ currentRight = resize(currentRight, index, false);
+ }
+ }
+
+ /* Resize an array, copying old contents if requested */
+ private int[] resize(int[] array, int size, boolean copy) {
+ int[] result = new int[size];
+ if (copy) {
+ System.arraycopy(array, 0, result, 0, array.length);
+ }
+ return result;
+ }
+}
diff --git a/dev/core/src/com/google/gwt/dev/util/editdistance/MyersBitParallelEditDistance.java b/dev/core/src/com/google/gwt/dev/util/editdistance/MyersBitParallelEditDistance.java
new file mode 100644
index 0000000..9e02ebd
--- /dev/null
+++ b/dev/core/src/com/google/gwt/dev/util/editdistance/MyersBitParallelEditDistance.java
@@ -0,0 +1,517 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+
+/**
+ * Computes Levenshtein string-edit distance using the
+ * algorithm of Eugene Myers (see "A fast bit-vector algorithm for
+ * approximate string matching based on dynamic progamming",
+ * Journal of the ACM, 46(3): 395-415, 1999), using the reformulations
+ * of Heikki Hyyro (see "Explaining and Extending the Bit-parallel
+ * Approximate String Matching Algorithm of Myers") along with
+ * Gonzalo Navarro ("Faster Bit-Parallel Approximate String Matching",
+ * Proc. 13th Combinatorial Pattern Matching (CPM'2002)).
+ */
+public abstract class MyersBitParallelEditDistance
+ implements GeneralEditDistance, Cloneable {
+ /*
+ * The Myers algorithm is based on an adaptation of the traditional
+ * dynamic programming algorithm, in which a matrix of distances
+ * is computed column-wise (or row-wise). For strings of length
+ * M and N, an MxN matrix (D) is filled with distances for substrings:
+ * D[i,j] holds the distance for string1[0..i]=>string2[0..j].
+ * The matrix is initially populated with the trivial values
+ * D[0,j]=j and D[i,0]=i; and then expanded with the rule:
+ * D[i,j] = min( D[i-1,j]+1, // insertion
+ * D[i,j-1]+1, // deletion
+ * (D[i-1,j-1]
+ * + (string1[i]==string2[j])
+ * ? 0 // match
+ * : 1 // substitution ) )
+ *
+ * The Myers algorithm takes advantage of the observation that
+ * the difference from one matrix position to its higher diagonal
+ * is either 0 or 1, and consequently row-to-row and column-to-column
+ * differences are 1, 0, or -1. It organizes whole columns of
+ * differences into bit arrays, computing new columns with
+ * arithmetic on whole words. The edit distance is computed
+ * by tracking the column changes in the last row.
+ *
+ * The description below follows the naming from the Hyrro papers above.
+ * The pattern string (length M) is arranged vertically; the
+ * string to be matched (length N) is horizontal. Columns are
+ * represented by bit-arrays with (1<<j) set for row i. The
+ * inter-cell differences are grouped into column-variables:
+ * VN has bits set when the transition from row i-1 to row i is
+ * negative; VP has bits set when it is positive; HN and HP
+ * have bits set for column j-1 to j transitions that are
+ * negative or positive, respectively. DZ has bits set when
+ * the diagonal from (i-1,j-1) to (i,j) is zero.
+ *
+ * The computation proceeds as follows:
+ * We start at column 0, where all VP bits are 1. The distance
+ * is accumulated in row M, with initial value M in the matrix.
+ *
+ * For each new column, we compute the diagonal-zero (DZ) first.
+ * This happens under three circumstances:
+ *
+ * when the pattern matches in this position ("match" above):
+ * We make use of a bitmask, one for each character
+ * in the alphabet, that has bits set wherever that character
+ * appears in the pattern. These bitmasks are pre-computed
+ * when the pattern is established (in the class constructor).
+ * In the column loop, we simply look up the bitmask for the
+ * character in this column. We call this PM.
+ *
+ * when the prior column has a negative vertical transition
+ * in this row ("deletion"):
+ * This is simply the VN bit array from the prior column.
+ *
+ * when the row above has a negative transition in this column
+ * ("insertion").
+ * This happens when the diagonal coming into the cell above
+ * (in *this* column) is zero but the vertical transition
+ * in the preceding column for that row is positive. That is:
+ *
+ * D[i-2,j-1]
+ * | \
+ * +1 0
+ * | \
+ * D[i-1,j-1] -1 -> D[i-1,j]
+ * | \ |
+ * ? 0 +1
+ * | \ |
+ * D[i,j-1] D[i,j]
+ *
+ * That is DZ[i,j] = DZ[i-1,j] & VP[i-1,j-1].
+ * Note that this is recursive in the DZ[*,j] bit array, but we
+ * want to compute the whole array at once. We need an expression
+ * that will propagate a 1 bit down the DZ column when the associated
+ * VP bit is set. Noting that the DZ[*,0] is the least significant
+ * bit, this means propagating to more significant bits. The carry bit
+ * of an addition has this behavior. If we set
+ * DZ = (PM | VN) // first two cases
+ * then we can propagate bits up iff the corresponding VP bit
+ * is set with:
+ * DZ |= (((DZ & VP) + VP) ^ VP);
+ * The (DZ&VP) piece covers our required condition on VP. Adding
+ * the VP bits causes any 1 bits from (DZ&VP) to result in a carry,
+ * but also introduces a 1 bit where VP=1 but DZ=0, which are then
+ * cleared with the ^VP piece. The propagated bits are |ed into the
+ * base case to get a final result.
+ *
+ * Once we have DZ, we can compute HP and HN. Note that VP and VN have
+ * not yet been updated, and so refer to the preceding column; DZ refers
+ * to the current column.
+ * HN is true when DZ is true in this column but VP was true
+ * in the prior column (HN = DZ & VP)
+ * HP is true when: the prior VN is true (otherwise, the diagonal would
+ * be negative, and that cannot happen); or, DZ is false (meaning
+ * the diagonal increased by 1) and the prior-column VP is also false
+ * (HP = VN | ~(DZ|VP))
+ *
+ * Once we have HP and HN, we can tally the effect on the last row (adding
+ * or subtracting one, respectively).
+ *
+ * Finally, we can compute VP and VN for this column, in preparation
+ * for the next round. These computations are analagous to the ones for
+ * HN and HP above, except that there, the incoming V values already
+ * reflected the prior column and DZ this one, but now the H and DZ values
+ * reflect the same column. So, we first shift the H values to associate
+ * with the proper bits from DZ.
+ */
+
+ /**
+ * A trivial implementation for the zero-length string
+ */
+ static class Empty extends MyersBitParallelEditDistance {
+ Empty(CharSequence s) {
+ super(s);
+ }
+
+ @Override
+ public GeneralEditDistance duplicate() {
+ return this; /* thread-safe */
+ }
+
+ @Override
+ public int getDistance(CharSequence s, int k) {
+ return s.length();
+ }
+ }
+
+ /**
+ * An implementation using multiple integers for each column.
+ * This follows the pattern for the single-word implementation, with
+ * these changes:
+ * sums/shifts must propagate from one word to the next;
+ * to make that easier (and cheaper), we use only (SIZE-1) bits per word;
+ */
+ static class Multi extends MyersBitParallelEditDistance {
+ /* How many integers we use per column */
+ int count;
+
+ /**
+ * Where the last-row bit lives -- only in the last array slot, though
+ */
+ final int lastBitPosition;
+
+ /**
+ * Bitmaps for pattern string: [pattern_index][column_index]
+ */
+ final int[][] positions;
+
+ int[] verticalNegativesReusable;
+ /* Reusable arrays for vertical changes */
+ int[] verticalPositivesReusable;
+
+ /**
+ * A mask with those bits set
+ */
+ final int wordMask = (-1 >>> 1);
+
+ /**
+ * How many bits we use per word
+ */
+ final int wordSize = Integer.SIZE - 1;
+
+ /**
+ * Constructs a multi-word engine
+ */
+ Multi(CharSequence s) {
+ super(s);
+
+ /* Compute number of words to use (rounding up) */
+ count = (m + wordSize - 1) / wordSize;
+
+ /* Precompute bitmaps for pattern string */
+ positions = PatternBitmap.map(s, idx, new int [idx.size()][], wordSize);
+
+ /* Where in last word does last row appear */
+ lastBitPosition = (1 << ((m - 1) % wordSize));
+
+ /* Initialize scratchpad items */
+ perThreadInit();
+ }
+
+ @Override
+ public int getDistance(CharSequence s, int k) {
+ indices = idx.map(s, indices);
+
+ /* Initialize verticalPositive to all bits on, verticalNegative all off */
+ int[] verticalPositives = verticalPositivesReusable;
+ java.util.Arrays.fill(verticalPositives, wordMask);
+ int[] verticalNegatives = verticalNegativesReusable;
+ java.util.Arrays.fill(verticalNegatives, 0);
+
+ int distance = m;
+ int len = s.length();
+
+ /* We can only miss the distance-- below this many times: */
+ int maxMisses = k + len - m;
+ if (maxMisses < 0) maxMisses = Integer.MAX_VALUE;
+
+ outer:
+ for (int j = 0; j < len; j++) {
+ int[] position = positions[indices[j]];
+
+ /* Carry bits from one word to the next */
+ int sum = 0;
+ int horizontalPositiveShift = 1;
+ int horizontalNegativeShift = 0;
+
+ /* Iterate through words for this column */
+ for (int i = 0; i < count; i++) {
+ int verticalNegative = verticalNegatives[i];
+ int patternMatch = (position[i] | verticalNegative);
+ int verticalPositive = verticalPositives[i];
+ sum = (verticalPositive & patternMatch)
+ + (verticalPositive) + (sum >>> wordSize);
+ int diagonalZero = ((sum & wordMask) ^ verticalPositive)
+ | patternMatch;
+ int horizontalPositive = (verticalNegative
+ | ~(diagonalZero | verticalPositive));
+ int horizontalNegative = diagonalZero & verticalPositive;
+
+ if (i == (count - 1)) { /* only last bit in last word */
+ if ((horizontalNegative & lastBitPosition) != 0) {
+ distance--;
+ } else if ((horizontalPositive & lastBitPosition) != 0) {
+ distance++;
+ if ((maxMisses -= 2) < 0) {
+ break outer;
+ }
+ } else if (--maxMisses < 0) {
+ break outer;
+ }
+ }
+
+ horizontalPositive = ((horizontalPositive << 1)
+ | horizontalPositiveShift);
+ horizontalPositiveShift = (horizontalPositive >>> wordSize);
+
+ horizontalNegative = ((horizontalNegative << 1)
+ | horizontalNegativeShift);
+ horizontalNegativeShift = (horizontalNegative >>> wordSize);
+
+ verticalPositives[i] = (horizontalNegative
+ | ~(diagonalZero | horizontalPositive))
+ & wordMask;
+ verticalNegatives[i] = (diagonalZero & horizontalPositive) & wordMask;
+ }
+ }
+ return distance;
+ }
+
+ @Override
+ protected void perThreadInit() {
+ super.perThreadInit();
+
+ /* Allocate verticalPositive/verticalNegative arrays */
+ verticalPositivesReusable = new int[count];
+ verticalNegativesReusable = new int[count];
+ }
+ }
+
+ /*
+ * The following code is duplicated with both "int" and "long"
+ * as the word primitive type. To improve maintainability, all instances
+ * of the word type are marked with "WORD" in comments. This
+ * allows the "long" version to be regenerated from the "int" version
+ * through a mechanical replacement. To make this work, the
+ * class name also needs to include the primitive type (without
+ * altered capitalization) and the WORD marker -- this is a quite
+ * intentional violation of the usual class naming rules.
+ */
+ /**
+ * An implementation using "int" as the word size.
+ */
+ // Replace "int/*WORD*/" with "long/*WORD*/" to generate the long version.
+ static class TYPEint/*WORD*/ extends MyersBitParallelEditDistance {
+ final int/*WORD*/ lastBitPosition;
+ final int/*WORD*/[] map;
+
+ @SuppressWarnings("cast")
+ TYPEint/*WORD*/(CharSequence s) {
+ super(s);
+ /* Precompute bitmaps for this pattern */
+ map = PatternBitmap.map(s, idx, new int/*WORD*/[idx.size()]);
+ /* Compute the bit that represents a change in the last row */
+ lastBitPosition = (((int/*WORD*/)1) << (m - 1));
+ }
+
+ @Override
+ public int getDistance(CharSequence s, int k) {
+ int len = s.length();
+
+ /* Quick check based on length */
+ if (((len - m) > k) || ((m - len) > k))
+ return k + 1;
+
+ /* Map characters to their integer positions in the bitmap array */
+ indices = idx.map(s, indices);
+
+ /* Initially, vertical change is all positive (none negative) */
+ int/*WORD*/ verticalPositive = -1;
+ int/*WORD*/ verticalNegative = 0;
+ int distance = m;
+
+ /* We can only miss the "distance--" below this many times: */
+ int maxMisses = k + len - m;
+ if (maxMisses < 0) maxMisses = Integer.MAX_VALUE;
+
+ for (int j = 0; j < len; j++) {
+ /* Where is diagonal zero: matches, or prior VN; plus recursion */
+ int/*WORD*/ diagonalZero = map[indices[j]] | verticalNegative;
+ diagonalZero |= (((diagonalZero & verticalPositive) + verticalPositive)
+ ^ verticalPositive);
+
+ /* Compute horizontal changes */
+ int/*WORD*/ horizontalPositive = verticalNegative
+ | ~(diagonalZero | verticalPositive);
+ int/*WORD*/ horizontalNegative = diagonalZero & verticalPositive;
+
+ /* Update final distance based on horizontal changes */
+ if ((horizontalNegative & lastBitPosition) != 0) {
+ distance--;
+ } else if ((horizontalPositive & lastBitPosition) != 0) {
+ distance++;
+ if ((maxMisses -= 2) < 0) {
+ break;
+ }
+ } else if (--maxMisses < 0) {
+ break;
+ }
+
+ /* Shift Hs to next row, compute new Vs analagously to Hs above */
+ horizontalPositive = (horizontalPositive << 1) | 1;
+ verticalPositive = (horizontalNegative << 1)
+ | ~(diagonalZero | horizontalPositive);
+ verticalNegative = diagonalZero & horizontalPositive;
+ }
+ return distance;
+ }
+ }
+
+ /**
+ * An implementation using "long" as the word size.
+ * GENERATED MECHANICALLY FROM THE "int" VERSION ABOVE.
+ * DO NOT EDIT THIS ONE -- EDIT ABOVE AND REAPPLY QUERY/REPLACE.
+ */
+ static class TYPElong/*WORD*/ extends MyersBitParallelEditDistance {
+ final long/*WORD*/ lastBitPosition;
+ final long/*WORD*/[] map;
+
+ @SuppressWarnings("cast")
+ TYPElong/*WORD*/(CharSequence s) {
+ super(s);
+ /* Precompute bitmaps for this pattern */
+ map = PatternBitmap.map(s, idx, new long/*WORD*/[idx.size()]);
+ /* Compute the bit that represents a change in the last row */
+ lastBitPosition = (((long/*WORD*/)1) << (m - 1));
+ }
+
+ @Override
+ public int getDistance(CharSequence s, int k) {
+ int len = s.length();
+
+ /* Quick check based on length */
+ if (((len - m) > k) || ((m - len) > k))
+ return k + 1;
+
+ /* Map characters to their integer positions in the bitmap array */
+ indices = idx.map(s, indices);
+
+ /* Initially, vertical change is all positive (none negative) */
+ long/*WORD*/ verticalPositive = -1;
+ long/*WORD*/ verticalNegative = 0;
+ int distance = m;
+
+ /* We can only miss the "distance--" below this many times: */
+ int maxMisses = k + len - m;
+ if (maxMisses < 0) maxMisses = Integer.MAX_VALUE;
+
+ for (int j = 0; j < len; j++) {
+ /* Where is diagonal zero: matches, or prior VN; plus recursion */
+ long/*WORD*/ diagonalZero = map[indices[j]] | verticalNegative;
+ diagonalZero |= (((diagonalZero & verticalPositive) + verticalPositive)
+ ^ verticalPositive);
+
+ /* Compute horizontal changes */
+ long/*WORD*/ horizontalPositive = verticalNegative
+ | ~(diagonalZero | verticalPositive);
+ long/*WORD*/ horizontalNegative = diagonalZero & verticalPositive;
+
+ /* Update final distance based on horizontal changes */
+ if ((horizontalNegative & lastBitPosition) != 0) {
+ distance--;
+ } else if ((horizontalPositive & lastBitPosition) != 0) {
+ distance++;
+ if ((maxMisses -= 2) < 0) {
+ break;
+ }
+ } else if (--maxMisses < 0) {
+ break;
+ }
+
+ /* Shift Hs to next row, compute new Vs analagously to Hs above */
+ horizontalPositive = (horizontalPositive << 1) | 1;
+ verticalPositive = (horizontalNegative << 1)
+ | ~(diagonalZero | horizontalPositive);
+ verticalNegative = diagonalZero & horizontalPositive;
+ }
+ return distance;
+ }
+ }
+
+ /**
+ * Chooses an appropriate implementation for a given pattern string.
+ * @param s pattern string
+ * @return distance calculator appropriate for the pattern
+ */
+ public static MyersBitParallelEditDistance getInstance(CharSequence s) {
+ int m = s.length();
+ return (m <= Integer.SIZE) ?
+ ((m == 0) ? new Empty(s) : new TYPEint(s)) :
+ (s.length() <= Long.SIZE) ?
+ new TYPElong(s) :
+ new Multi(s);
+ }
+
+ /**
+ * Tests a computation manually
+ */
+ public static void main(String[] args) {
+ MyersBitParallelEditDistance b = getInstance(args[0]);
+ int k = args.length > 2 ? Integer.parseInt(args[2]) : 0;
+ System.out.println("Result: " + b.getDistance(args[1], k));
+ }
+
+ /**
+ * Index mapping for pattern string
+ */
+ final CharIndex idx;
+
+ /**
+ * Reusable array of indices for target strings
+ */
+ int[] indices = new int[0];
+
+ /**
+ * Length of pattern
+ */
+ final int m;
+
+ /**
+ * Constructs a distance calculator for a given string
+ */
+ protected MyersBitParallelEditDistance(CharSequence s) {
+ m = s.length();
+ idx = CharIndex.getInstance(s);
+ }
+
+ public GeneralEditDistance duplicate() {
+ try {
+ return (MyersBitParallelEditDistance)clone();
+ } catch (CloneNotSupportedException x) { /*IMPOSSIBLE */
+ throw new IllegalStateException("Cloneable object would not clone");
+ }
+ }
+
+ /**
+ * Computes distance from the pattern to a given string, bounded by
+ * a limiting distance @see(GeneralEditDistance.getDistance(CharSequence,int))
+ */
+ public abstract int getDistance(CharSequence s, int k);
+
+ @Override
+ protected Object clone() throws CloneNotSupportedException {
+ Object obj = super.clone();
+
+ /* Re-initialize any non-thread-safe parts */
+ ((MyersBitParallelEditDistance)obj).perThreadInit();
+
+ return obj;
+ }
+
+ /**
+ * Initializes items that cannot be shared among threads
+ */
+ protected void perThreadInit() {
+ indices = new int[0];
+ }
+}
diff --git a/dev/core/src/com/google/gwt/dev/util/editdistance/PatternBitmap.java b/dev/core/src/com/google/gwt/dev/util/editdistance/PatternBitmap.java
new file mode 100644
index 0000000..5ee1733
--- /dev/null
+++ b/dev/core/src/com/google/gwt/dev/util/editdistance/PatternBitmap.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+
+/**
+ * A bitmap array generator for the positions that a given character
+ * appears within a pattern. The array of bitmaps is indexed by a
+ * CharIndex that represents the alphabet for the pattern.
+ * One bitmap is produced for each character. The bitmap for
+ * a character "c" has the <tt>(1<<i)</tt> bit set for each position "i"
+ * where that character appears in the pattern.
+ *
+ * Several implementations are provided for different string
+ * sizes: int, long, int[]. The caller provides a (zeroed) array of the
+ * desired type, sized according to the CharIndex; the array is
+ * returned as a convenience.
+ */
+public class PatternBitmap {
+ /**
+ * Computes a pattern bitmap for a short character sequence.
+ * For each character in the alphabet (represented by a CharIndex),
+ * bits are set where that character appears in the sequence.
+ * For this generator, the pattern must fit in an "int": the
+ * character sequence must not be longer than Integer.SIZE.
+ * @param s the character sequence defining the bits to be set
+ * @param idx the alphabet for which bitmaps should be produced
+ * @param result an array (of size <tt>idx.size()</tt> or greater) to
+ * hold the resulting bitmaps
+ * @return the result array passed as a parameter, for convenience
+ */
+ public static int[] map(CharSequence s, CharIndex idx, int[] result) {
+ int len = s.length();
+ assert (len <= Integer.SIZE);
+ for (int i = 0; i < len; i++) {
+ result[idx.lookup(s.charAt(i))] |= (1 << i);
+ }
+ return result;
+ }
+
+ /**
+ * Computes a pattern bitmap for a character sequence of any length.
+ * For each character in the alphabet (represented by a CharIndex),
+ * bits are set where that character appears in the sequence.
+ * Each (per-character) bitmap is represented by an "int[]".
+ * @param s the character sequence defining the bits to be set
+ * @param idx the alphabet for which bitmaps should be produced
+ * @param result an array (of size <tt>idx.size()</tt> or greater) to
+ * hold the resulting bitmaps
+ * @param width how many bits to be use in each word, no more
+ * than Integer.SIZE
+ * @return the result array passed as a parameter, for convenience
+ */
+ public static int[][] map(CharSequence s, CharIndex idx,
+ int[][] result, int width) {
+ assert (width <= Integer.SIZE);
+ int len = s.length();
+ int rowSize = (len + width - 1) / width;
+
+ /*
+ * Use one zero-filled bitmap for alphabet characters not in the pattern
+ */
+ int[] nullElement = new int[rowSize];
+ java.util.Arrays.fill(result, nullElement);
+
+ int wordIndex = 0; /* Which word we are on now */
+ int bitWithinWord = 0; /* Which bit within that word */
+
+ for (int i = 0; i < s.length(); i++) {
+ int[] r = result[idx.lookup(s.charAt(i))];
+ if (r == nullElement) {
+
+ /* Create a separate zero-filled bitmap for this alphabet character */
+ r = result[idx.lookup(s.charAt(i))] = new int[rowSize];
+ }
+ r[wordIndex] |= (1 << bitWithinWord);
+
+ /* Step to the next bit (and word if appropriate) */
+ if (++bitWithinWord == width) {
+ bitWithinWord = 0;
+ wordIndex++;
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Computes a pattern bitmap for a medium character sequence.
+ * For each character in the alphabet (represented by a CharIndex),
+ * bits are set where that character appears in the sequence.
+ * For this generator, the pattern must fit in a "long": the
+ * character sequence must not be longer than Long.SIZE.
+ * @param s the character sequence defining the bits to be set
+ * @param idx the alphabet for which bitmaps should be produced
+ * @param result an array (of size <tt>idx.size()</tt> or greater) to
+ * hold the resulting bitmaps
+ * @return the result array passed as a parameter, for convenience
+ */
+ public static long[] map(CharSequence s, CharIndex idx, long[] result) {
+ int len = s.length();
+ assert (len <= Long.SIZE);
+ for (int i = 0; i < len; i++) {
+ result[idx.lookup(s.charAt(i))] |= (1L << i);
+ }
+ return result;
+ }
+ private PatternBitmap() { /* Prevent instantiation */ }
+}
diff --git a/dev/core/test/com/google/gwt/dev/util/editdistance/CharIndexTest.java b/dev/core/test/com/google/gwt/dev/util/editdistance/CharIndexTest.java
new file mode 100644
index 0000000..cdd2490
--- /dev/null
+++ b/dev/core/test/com/google/gwt/dev/util/editdistance/CharIndexTest.java
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+import com.google.gwt.dev.util.editdistance.CharIndex;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for {@link CharIndex}.
+ */
+public class CharIndexTest extends TestCase {
+
+ /**
+ * Tests a mapping where the masked reduction fails (where
+ * the low-order bits of several characters overlap.
+ */
+ public void testFullHash() throws Exception {
+ String string = "abc\u0001\u0101\u0201\u0301";
+ CharIndex idx = CharIndex.getInstance(string+string);
+ assertEquals(idx.getClass(), CharIndex.FullHash.class);
+ assertEquals(idx.lookup('c'), 3);
+ assertEquals(idx.nullElement(), 0);
+
+ /* because string has no duplicates: */
+ assertEquals(idx.size(), string.length() + 1);
+
+ generalVerify(idx, string, "xyz012\u0123\u1234");
+ }
+
+ /** Tests fullhash.Char overridden methods */
+ public void testFullHashChar() {
+ CharIndex.FullHash.Char x = new CharIndex.FullHash.Char();
+ x.c = 'A';
+ assertEquals("'A'", x.toString());
+ }
+
+ /**
+ * Tests a masked mapping (where characters all fall within
+ * a slice of Unicode defined by a simple mask).
+ */
+ public void testMasked() throws Exception {
+ String string = "\u0141\u0142\u0171\u0131";
+ CharIndex idx = CharIndex.getInstance(string+string);
+ assertEquals(idx.getClass(), CharIndex.Masked.class);
+ assertEquals(idx.lookup('\u0141'), 0x0041);
+ assertEquals(idx.size(), (idx.nullElement()+1));
+ generalVerify(idx, string, "xyz012\u0123\u1234");
+ }
+
+ /**
+ * Tests a straight mapping (where characters are all less
+ * than some maximum, including at least ASCII.
+ */
+ public void testStraight() throws Exception {
+ String string = "abcdandmore";
+ CharIndex idx = CharIndex.getInstance(string+string);
+ assertEquals(idx.getClass(), CharIndex.Straight.class);
+ assertEquals(idx.lookup('a'), 'a');
+ assertEquals(idx.size(), (idx.nullElement()+1));
+ generalVerify(idx, string, "xyz012\u0123\u1234");
+ }
+
+ /**
+ * Verifies abstract properties of any CharIndex
+ * @param idx index to test
+ * @param string characters in the pattern for that index
+ * @param more characters not in the pattern
+ */
+ void generalVerify(CharIndex idx, String string, String more) {
+ /* Test the map() method */
+ int[] mapped = idx.map(string, new int[0]);
+
+ for (int i = 0; i < string.length(); i++) {
+ char ci = string.charAt(i);
+ for (int j = 0; j < string.length(); j++) {
+ char cj = string.charAt(j);
+
+ /* Each character in the pattern should get a unique index */
+ assertTrue((mapped[i] == idx.lookup(cj)) == (ci == cj));
+ }
+ for (int j = 0; j < more.length(); j++) {
+ char cj = more.charAt(j);
+
+ /*
+ * Characters not in the pattern should not match any that
+ * are in the pattern (but they may match one another).
+ */
+ assertTrue(idx.lookup(ci) != idx.lookup(cj));
+ }
+ }
+ }
+}
diff --git a/dev/core/test/com/google/gwt/dev/util/editdistance/GeneralEditDistanceTest.java b/dev/core/test/com/google/gwt/dev/util/editdistance/GeneralEditDistanceTest.java
new file mode 100644
index 0000000..b0d9afe
--- /dev/null
+++ b/dev/core/test/com/google/gwt/dev/util/editdistance/GeneralEditDistanceTest.java
@@ -0,0 +1,327 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+import com.google.gwt.dev.util.editdistance.GeneralEditDistance;
+import com.google.gwt.dev.util.editdistance.GeneralEditDistances;
+
+import junit.framework.TestCase;
+
+/**
+ * Test cases for the GeneralEditDistance class.
+ */
+public class GeneralEditDistanceTest extends TestCase {
+
+ /**
+ * A basic set of tests for any unit-cost Levenshtein edit distance
+ * implementation.
+ */
+ protected static abstract class AbstractLevenshteinTestCase extends TestCase {
+ Factory g;
+ AbstractLevenshteinTestCase(Factory g) {
+ this.g = g;
+ }
+
+ /**
+ * Tests a Levenshtein engine against the DP-based computation
+ * for a bunch of string pairs.
+ */
+ public void testLevenshteinOnWords() {
+ for (int i = 0; i < words.length; i++) {
+ for (int j = 0; j < words.length; j++) {
+ GeneralEditDistance ed = g.getInstance(words[i]);
+ specificAlgorithmVerify(ed,
+ words[i], words[j],
+ wordsDistances[i][j]);
+ }
+ }
+ }
+
+ /** Tests Levenshtein edit distance on a longer pattern */
+ public void testLongerPattern() {
+ genericLevenshteinVerify("abcdefghijklmnopqrstuvwxyz",
+ "abcefghijklMnopqrStuvwxyz..",
+ 5 /* dMS.. */);
+ }
+
+ /** Tests Levenshtein edit distance on a very short pattern */
+ public void testShortPattern() {
+ genericLevenshteinVerify("short", "shirt", 1);
+ }
+
+ /** Verifies zero-length behavior */
+ public void testZeroLengthPattern() {
+ String nonEmpty = "target";
+ genericLevenshteinVerify("", nonEmpty, nonEmpty.length());
+ genericLevenshteinVerify(nonEmpty, "", nonEmpty.length());
+ }
+
+ /** Tests the default Levenshtein engine on a pair of strings */
+ void genericLevenshteinVerify(CharSequence s1, CharSequence s2,
+ int expectedResult) {
+ specificAlgorithmVerify(g.getInstance(s1),
+ s1, s2, expectedResult);
+ }
+ }
+
+ /**
+ * A base class for other reusable tests
+ */
+ public interface Factory {
+ public GeneralEditDistance getInstance(CharSequence s1);
+ }
+
+ /** Test of the "best-of-breed" Levenshtein engine (getLevenshteinDistance) */
+ public static class GenericLevenshteinTest extends AbstractLevenshteinTestCase {
+ public GenericLevenshteinTest() {
+ super(new Factory() {
+ public GeneralEditDistance getInstance(CharSequence s1) {
+ return GeneralEditDistances.getLevenshteinDistance(s1);
+ }
+ });
+ }
+ }
+
+ /** A very large string for testing. */
+ public final static String MAGNA =
+ "We have granted to God, and by this our present Charter have " +
+ "confirmed, for Us and our Heirs for ever, that the Church of " +
+ "England shall be free, and shall have all her whole Rights and " +
+ "Liberties inviolable. We have granted also, and given to all " +
+ "the Freemen of our Realm, for Us and our Heirs for ever, these " +
+ "Liberties under-written, to have and to hold to them and their " +
+ "Heirs, of Us and our Heirs for ever.";
+
+ /**
+ * A small set of words for testing, including at least some of
+ * each of these: empty, very short, more than 32/64 character,
+ * punctuation, non-ASCII characters
+ */
+ static String[] words = { "", "a", "b", "c", "ab", "ace",
+ "fortressing", "inadequately", "prank", "authored",
+ "fortresing", "inadeqautely", "prang", "awthered",
+ "cruller's", "fanatic", "Laplace", "recollections",
+ "Kevlar", "underpays", "jalape\u00f1o","ch\u00e2telaine",
+ "kevlar", "overpaid", "jalapeno", "chatelaine",
+ "A survey of algorithms for running text search by Navarro appeared",
+ "in ACM Computing Surveys 33#1: http://portal.acm.org/citation.cfm?...",
+ "Another algorithm (Four Russians) that Navarro",
+ "long patterns and high limits was not evaluated for inclusion here.",
+ "long patterns and low limits were evaluated for inclusion here.",
+ "Filtering algorithms also improve running search",
+ "for pure edit distance."
+ };
+
+ static int[][] wordsDistances = new int[words.length][words.length];
+ static {
+ int[][] expect = wordsDistances;
+ for (int i = 0; i < words.length; i++) {
+ for (int j = 0; j < i; j++) {
+ expect[i][j] = GeneralEditDistanceTest
+ .dynamicProgrammingLevenshtein(words[i],words[j]);
+ expect[j][i] = expect[i][j];
+ }
+ }
+ }
+ /**
+ * Computes Levenshtein edit distance using the far-from-optimal
+ * dynamic programming technique. This is here purely to verify
+ * the results of better algorithms.
+ */
+ public static int dynamicProgrammingLevenshtein(String s1, String s2) {
+ int[] lastRow = new int[s1.length()+1];
+ for (int i = 0; i < lastRow.length; i++) {
+ lastRow[i] = i;
+ }
+ for (int j = 0; j < s2.length(); j++) {
+ int[] thisRow = new int[lastRow.length];
+ thisRow[0] = j + 1;
+ char s2c = s2.charAt(j);
+ for (int i = 1; i < thisRow.length; i++) {
+ thisRow[i] = Math.min(Math.min(lastRow[i]+1,thisRow[i-1]+1),
+ lastRow[i-1] + ((s2c == s1.charAt(i-1)) ? 0 : 1));
+ }
+ lastRow = thisRow;
+ }
+ return lastRow[lastRow.length-1];
+ }
+
+ /**
+ * Performs some edits on a string in a StringBuilder.
+ * @param b string to be modified in place
+ * @param alphabet some characters guaranteed not to be in the original
+ * @param replaces how many single-character replacements to try
+ * @param inserts how many characters to insert
+ * @return the number of edits actually performed
+ */
+ public static int performSomeEdits(StringBuilder b,
+ String alphabet,
+ int replaces, int inserts) {
+ java.util.Random r = new java.util.Random(768614336404564651L);
+ int edits = 0;
+
+ for (int i = 0; i < inserts; i++) {
+ int where = r.nextInt(b.length());
+ b.insert(where, alphabet.charAt(r.nextInt(alphabet.length())));
+ edits++;
+ }
+ for (int i = 0; i < replaces; i++) {
+ int where = r.nextInt(b.length());
+ if (alphabet.indexOf(b.charAt(where)) < 0) {
+ String sub = "" + alphabet.charAt(r.nextInt(alphabet.length()));
+ b.replace(where, (where + 1), sub);
+ edits++;
+ }
+ }
+ return edits;
+ }
+
+ /**
+ * Generates a long random alphabetic string,
+ * suitable for use with testSomeEdits (using digits for the alphabet).
+ * @param size desired string length
+ * @param seed random number generator seed
+ * @return random alphabetic string of the requested length
+ */
+ static String generateRandomString(int size, Long seed) {
+ char[] alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
+
+ /* Create a (repeatable) random string from the alphabet */
+ java.util.Random rand = new java.util.Random(seed);
+ StringBuilder huge = new StringBuilder(size);
+ for (int i = 0; i < size; i++) {
+ huge.append(alphabet[rand.nextInt(alphabet.length)]);
+ }
+ return huge.toString();
+ }
+
+ /** Exercises an edit distance engine across a wide range of limit values */
+ static void genericVerification(GeneralEditDistance ed,
+ CharSequence s1, CharSequence s2,
+ int expectedResult) {
+
+ if (s1.length() < 500) {
+ /*
+ * For small strings, try every limit
+ */
+ int max = Math.max(s1.length(), s2.length()) + 2;
+ for (int k = 0; k < max; k++) {
+ verifyResult(s1, s2, expectedResult, k,
+ ed.getDistance(s2, k));
+ }
+ } else {
+ /*
+ * For big strings, try a sampling of limits:
+ * 0 to 3,
+ * another 4 on either side of the expected result
+ * s2 length
+ */
+ for (int k = 0; k < 4; k++) {
+ verifyResult(s1, s2, expectedResult, k,
+ ed.getDistance(s2, k));
+ }
+ for (int k = Math.max(4, expectedResult - 4); k < expectedResult + 4; k++) {
+ verifyResult(s1, s2, expectedResult, k,
+ ed.getDistance(s2, k));
+ }
+ verifyResult(s1, s2, expectedResult, s2.length(),
+ ed.getDistance(s2, s2.length()));
+ }
+
+ /* Always try near MAX_VALUE */
+ assertEquals(ed.getDistance(s2, Integer.MAX_VALUE-1), expectedResult);
+ assertEquals(ed.getDistance(s2, Integer.MAX_VALUE), expectedResult);
+ }
+
+ /** Tests a specific engine on a pair of strings */
+ static void specificAlgorithmVerify(GeneralEditDistance ed,
+ CharSequence s1, CharSequence s2,
+ int expectedResult) {
+ genericVerification(ed, s1, s2, expectedResult);
+
+ /* Try again with the same instance */
+ genericVerification(ed, s1, s2, expectedResult);
+
+ /* Try again with a duplicate instance */
+ genericVerification(ed.duplicate(), s1, s2, expectedResult);
+ }
+
+ /**
+ * Verifies the distance between an original string and some
+ * number of simple edits on it. The distance is assumed to
+ * be unit-cost Levenshtein distance.
+ */
+ static void testSomeEdits(Factory g, String original,
+ int replaces, int inserts) {
+ StringBuilder modified = new StringBuilder(original);
+ int edits = performSomeEdits(modified, "0123456789", replaces, inserts);
+
+ GeneralEditDistanceTest.specificAlgorithmVerify(
+ g.getInstance(original), original, modified, edits);
+
+ GeneralEditDistanceTest.specificAlgorithmVerify(
+ g.getInstance(modified), modified, original, edits);
+
+ GeneralEditDistanceTest.specificAlgorithmVerify(
+ g.getInstance(modified).duplicate(), modified, original, edits);
+ }
+
+ /**
+ * Verifies a single edit distance result.
+ * If the expected distance is within limit, result must b
+ * be correct; otherwise, result must be over limit.
+ *
+ * @param s1 one string compared
+ * @param s2 other string compared
+ * @param expectedResult correct distance from s1 to s2
+ * @param k limit applied to computation
+ * @param d distance computed
+ */
+ static void verifyResult(CharSequence s1, CharSequence s2,
+ int expectedResult, int k, int d) {
+ if (k >= expectedResult) {
+ assertEquals("Distance from " + s1 + " to " + s2
+ + " should be " + expectedResult
+ + " (within limit=" + k + ") but was " + d,
+ expectedResult, d);
+ } else {
+ assertTrue("Distance from " + s1 + " to " + s2
+ + " should be " + expectedResult
+ + " (exceeding limit=" + k + ") but was " + d,
+ d > k);
+ }
+ }
+
+ /** Tests special case of Levenshtein edit distance with small limits */
+ public void testAtMostOneError() {
+ assertEquals(
+ GeneralEditDistances.atMostOneError("unchanged", "unchanged"),
+ 0);
+
+ StringBuilder un = new StringBuilder("un");
+ assertEquals(
+ GeneralEditDistances.atMostOneError("unchanged", un.append("changed")),
+ 0);
+
+ assertEquals(GeneralEditDistances.atMostOneError("shirt", "short"), 1);
+ assertEquals(GeneralEditDistances.atMostOneError("shirt", "shirts"), 1);
+ assertEquals(GeneralEditDistances.atMostOneError("sort", "short"), 1);
+
+ assertTrue(GeneralEditDistances.atMostOneError("short", "verylong") > 1);
+ assertTrue(GeneralEditDistances.atMostOneError("short", "sharp") > 1);
+ assertTrue(GeneralEditDistances.atMostOneError("small", "malls") > 1);
+ }
+}
diff --git a/dev/core/test/com/google/gwt/dev/util/editdistance/ModifiedBerghelRoachEditDistanceTest.java b/dev/core/test/com/google/gwt/dev/util/editdistance/ModifiedBerghelRoachEditDistanceTest.java
new file mode 100644
index 0000000..bed7b27
--- /dev/null
+++ b/dev/core/test/com/google/gwt/dev/util/editdistance/ModifiedBerghelRoachEditDistanceTest.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+import com.google.gwt.dev.util.editdistance.GeneralEditDistance;
+import com.google.gwt.dev.util.editdistance.ModifiedBerghelRoachEditDistance;
+import com.google.gwt.dev.util.editdistance.GeneralEditDistanceTest.AbstractLevenshteinTestCase;
+
+import static com.google.gwt.dev.util.editdistance.GeneralEditDistanceTest.MAGNA;
+import static com.google.gwt.dev.util.editdistance.GeneralEditDistanceTest.generateRandomString;
+import static com.google.gwt.dev.util.editdistance.GeneralEditDistanceTest.testSomeEdits;
+
+/**
+ * Test cases for the ModifiedBerghelRoachEditDistance class.
+ *
+ * The bulk of the test is provided by the superclass, for
+ * which we provide GeneralEditDistance instances.
+ *
+ * Since Berghel-Roach is superior for longer strings with moderately
+ * low edit distances, we try a few of those specifically.
+ * This Modified form uses less space, and can handle yet larger ones.
+ */
+public class ModifiedBerghelRoachEditDistanceTest extends junit.framework.TestCase {
+ /** Basic Levenshtein tests for ModifedBerghelRoachEditDistance */
+ public static class Basic extends AbstractLevenshteinTestCase {
+ Basic() {
+ super(new Factory());
+ }
+ }
+ private static class Factory implements GeneralEditDistanceTest.Factory {
+ public GeneralEditDistance getInstance(CharSequence s) {
+ return ModifiedBerghelRoachEditDistance.getInstance(s.toString());
+ }
+ }
+
+ static final Factory FACTORY = new Factory();
+
+ public void testHugeEdit() {
+ final int SIZE = 10000;
+ final long SEED = 1;
+
+ testSomeEdits(FACTORY, generateRandomString(SIZE, SEED), (SIZE / 50), (SIZE / 50));
+ }
+
+ public void testHugeString() {
+ /*
+ * An even larger size is feasible, but the test would no longer
+ * qualify as "small".
+ */
+ final int SIZE = 20000;
+ final long SEED = 1;
+
+ testSomeEdits(FACTORY, generateRandomString(SIZE, SEED), 30, 25);
+ }
+
+ public void testLongString() {
+ testSomeEdits(FACTORY, MAGNA, 8, 10);
+ }
+
+ public void testLongStringMoreEdits() {
+ testSomeEdits(FACTORY, MAGNA, 40, 30);
+ }
+}
diff --git a/dev/core/test/com/google/gwt/dev/util/editdistance/MyersBitParallelEditDistanceTest.java b/dev/core/test/com/google/gwt/dev/util/editdistance/MyersBitParallelEditDistanceTest.java
new file mode 100644
index 0000000..9998788
--- /dev/null
+++ b/dev/core/test/com/google/gwt/dev/util/editdistance/MyersBitParallelEditDistanceTest.java
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+import com.google.gwt.dev.util.editdistance.GeneralEditDistance;
+import com.google.gwt.dev.util.editdistance.MyersBitParallelEditDistance;
+
+import junit.framework.TestCase;
+
+/**
+ * Test cases for the MyersBitParallelEditDistance class.
+ *
+ * Note that the GeneralEditDistance class tests provide
+ * good general coverage of this class as well.
+ *
+ * The tests here look for boundary conditions in the
+ * specific algorithm: near 32- and 64-bit edges.
+ */
+public class MyersBitParallelEditDistanceTest extends TestCase {
+
+ /** Generates an instance (just a notational shorthand) */
+ static MyersBitParallelEditDistance generate(String pattern) {
+ return MyersBitParallelEditDistance.getInstance(pattern);
+ }
+
+ public void test32end() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz012345";
+ String s2 = "abcdefghijklmnopqrstuvwxyz01234";
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2,
+ 1);
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2+"x",
+ 1);
+ }
+
+ public void test32start() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz012345";
+ String s2 = "Abcdefghijklmnopqrstuvwxyz012345";
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2,
+ 1);
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2.substring(1),
+ 1);
+ }
+
+ public void test32various() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz012345";
+ String s2 = "abcdeghijklmNopqrstu@vwxyz1234";
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2,
+ 5 /*fN@05*/);
+ }
+
+ /*
+ * Some tests with strings exactly 32 characters long -- just enough
+ * for an "int" bitmap.
+ */
+
+ public void test33end() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz0123456";
+ String s2 = "abcdefghijklmnopqrstuvwxyz012345";
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2,
+ 1);
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2+"x",
+ 1);
+ }
+
+ public void test33start() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz0123456";
+ String s2 = "Abcdefghijklmnopqrstuvwxyz0123456";
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2,
+ 1);
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2.substring(1),
+ 1);
+ }
+
+ public void test64end() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz0123456";
+ String s2 = "abcdefghijklmnopqrstuvwxyz012345";
+ GeneralEditDistanceTest.genericVerification(generate(s1+s1),
+ s1+s1, s1+s2,
+ 1);
+ GeneralEditDistanceTest.genericVerification(generate(s1+s1),
+ s1+s1, s1+s2+"x",
+ 1);
+ }
+
+ public void test64middle() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz0123456";
+ String s2 = "abcdefghijklmnopqrstuvwxyz012345";
+ GeneralEditDistanceTest.genericVerification(generate(s1+s1),
+ s1+s1, s1+"x"+s2,
+ 2);
+ GeneralEditDistanceTest.genericVerification(generate(s1+s1),
+ s1+s1, s2+"x"+s1,
+ 1);
+ }
+
+ public void test64start() {
+ String s1 = "abcdefghijklmnopqrstuvwxyz0123456";
+ String s2 = "Abcdefghijklmnopqrstuvwxyz0123456";
+ GeneralEditDistanceTest.genericVerification(generate(s1+s1),
+ s1+s1, s2+s1,
+ 1);
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2.substring(1),
+ 1);
+ }
+
+ public void testbig() {
+ String s1base = "abcdefghijklmnopqrstuvwxyz0123456";
+ String s1 = s1base + s1base + s1base + s1base + s1base;
+ String s2 = s1base
+ + "abcdeghijklmNopqrstu@vwxyz12346"
+ + s1base
+ + "bcdefGhijklmno$pqrstuvwxyz012345"
+ + s1base;
+
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2,
+ 5+4 /*fN@05, aG$6*/);
+ }
+
+ /** Verifies the choice of bit array sizing */
+ public void testBitArraySizing() {
+ String thirtyTwo = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
+ assertEquals(generate("").getClass(),
+ MyersBitParallelEditDistance.Empty.class);
+ assertEquals(generate(thirtyTwo).getClass(),
+ MyersBitParallelEditDistance.TYPEint.class);
+ assertEquals(generate(thirtyTwo+"x").getClass(),
+ MyersBitParallelEditDistance.TYPElong.class);
+ assertEquals(generate(thirtyTwo+thirtyTwo).getClass(),
+ MyersBitParallelEditDistance.TYPElong.class);
+ assertEquals(generate(thirtyTwo+thirtyTwo+"x").getClass(),
+ MyersBitParallelEditDistance.Multi.class);
+ }
+
+ /** Tests an "impossible" exception path */
+ public void testInternalCloneNotSupportedException() {
+ try {
+ new MyersBitParallelEditDistance.Multi("not really impossible") {
+ @Override
+ public Object clone() throws CloneNotSupportedException {
+ throw new CloneNotSupportedException("do the impossible");
+ }
+ }.duplicate();
+ assertTrue("Failed to throw exception", false);
+ } catch (IllegalStateException x) {
+ /* EXPECTED RESULT */
+ }
+ }
+
+ /** Test main programs to make sure they do not die unnaturally */
+ public void testMainProgramsForSanity() {
+ MyersBitParallelEditDistance.main(new String[] { "yes", "no", "5" });
+ MyersBitParallelEditDistance.Multi.main(new String[] { "yes", "no", "5" });
+ }
+
+ /** Test on a variety of word pairs, reusing an instance appropriately */
+ public void testOnWordSet() {
+ String [] words = GeneralEditDistanceTest.words;
+ int [][] expect = GeneralEditDistanceTest.wordsDistances;
+ for (int i = 0; i < words.length; i++) {
+ GeneralEditDistance engine = generate(words[i]);
+ for (int j = 0; j <= i; j++) {
+ GeneralEditDistanceTest.genericVerification(engine,
+ words[i], words[j],
+ expect[i][j]);
+ }
+ }
+ }
+
+ /** Tests a short pattern and target */
+ public void testShort() {
+ String s1 = "short";
+ String s2 = "snorts";
+ GeneralEditDistanceTest.genericVerification(generate(s1),
+ s1, s2, 2);
+ }
+
+ /** Tests zero-length patterns and targets; distance is other length */
+ public void testZeroLength() {
+ assertEquals(0, generate("").getDistance("", 1));
+
+ String other = "other";
+ GeneralEditDistanceTest.genericVerification(generate(""),
+ "", other,
+ other.length());
+ GeneralEditDistanceTest.genericVerification(generate(other),
+ other, "",
+ other.length());
+ }
+}
diff --git a/dev/core/test/com/google/gwt/dev/util/editdistance/PatternBitmapTest.java b/dev/core/test/com/google/gwt/dev/util/editdistance/PatternBitmapTest.java
new file mode 100644
index 0000000..8941315
--- /dev/null
+++ b/dev/core/test/com/google/gwt/dev/util/editdistance/PatternBitmapTest.java
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.dev.util.editdistance;
+
+import com.google.gwt.dev.util.editdistance.CharIndex;
+import com.google.gwt.dev.util.editdistance.PatternBitmap;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for {@link PatternBitmap}.
+ */
+public class PatternBitmapTest extends TestCase {
+
+ /** Reverses a string */
+ public static String reverse(String s) {
+ StringBuilder b = new StringBuilder();
+ for (int i = s.length(); --i >= 0;) {
+ b.append(s.charAt(i));
+ }
+ return b.toString();
+ }
+
+ /**
+ * Tests the int[] form of mapping.
+ */
+ public void testInteger() throws Exception {
+ String string = "abcdandmore";
+ CharIndex idx = CharIndex.getInstance(string);
+
+ int[] map = PatternBitmap.map(string, idx, new int[idx.size()]);
+
+ /* Spot check some */
+ assertEquals(0x11, map[idx.lookup('a')]);
+ assertEquals(0x02, map[idx.lookup('b')]);
+ assertEquals(0x04, map[idx.lookup('c')]);
+ assertEquals(0x48, map[idx.lookup('d')]);
+
+ /* Check all others for zero/non-zero */
+ for (int i = 0; i < 0xffff; i++) {
+ char c = (char)i;
+ int where = string.indexOf(c);
+ if (where >= 0) {
+ assertTrue("Map for pattern character '" + c + "' should be non-zero",
+ (map[idx.lookup(c)] & (1<<where)) != 0);
+ } else {
+ assertEquals("Map for unused character '" + c + "' should be zero",
+ 0, map[idx.lookup(c)]);
+ }
+ }
+ }
+
+ /**
+ * Tests the int[][] form of mapping.
+ */
+ public void testIntegerArray() throws Exception {
+ String string = "x012345678901234567890123456789"
+ + "01234567890123456789x0123456789"
+ + "01234x567890123456789";
+ int wordSize = 31;
+
+ CharIndex idx = CharIndex.getInstance(string);
+
+ int[][] map = PatternBitmap.map(string, idx, new int[idx.size()][],
+ wordSize);
+
+ /* Spot check some */
+ assertEquals(1<<0, map[idx.lookup('x')][0]);
+ assertEquals(1<<20, map[idx.lookup('x')][1]);
+ assertEquals(1<<5, map[idx.lookup('x')][2]);
+
+ /* Check all others for null element/not */
+ int[] notThere = map[idx.lookup('\u0000')];
+ for (int i = 0; i < 0xffff; i++) {
+ char c = (char)i;
+ int where = string.indexOf(c);
+ if (where >= 0) {
+ assertTrue("Map for pattern character '" + c + "'"
+ + " should be non-zero",
+ (map[idx.lookup(c)] != notThere));
+ int bit = map[idx.lookup(c)][where/wordSize] & (1 << (where%wordSize));
+ assertTrue("Map for pattern character '" + c + "'"
+ + " should have " + where + " bit on",
+ (bit != 0));
+ } else {
+ assertEquals("Map for unused character '" + c + "' should be none",
+ notThere, map[idx.lookup(c)]);
+ }
+ }
+ }
+
+ /**
+ * Tests the long[] form of mapping.
+ */
+ public void testLong() throws Exception {
+ String string = reverse("The quick brown fox jumps over the lazy dog.");
+ CharIndex idx = CharIndex.getInstance(string);
+
+ long[] map = PatternBitmap.map(string, idx, new long[idx.size()]);
+
+ /* Spot check some */
+ long e = Long.parseLong("00100000000000000000000000001000010000000000", 2);
+ /* ............ The quick brown fox jumps over the lazy dog. */
+
+ assertEquals(e, map[idx.lookup('e')]);
+
+ long o = Long.parseLong("00000000000010000100000000100000000000000100", 2);
+ /* ............ The quick brown fox jumps over the lazy dog. */
+ assertEquals(o, map[idx.lookup('o')]);
+
+ /* Check all others for zero/non-zero */
+ for (int i = 0; i < 0xffff; i++) {
+ char c = (char)i;
+ int where = string.indexOf(c);
+ if (where >= 0) {
+ assertTrue("Map for pattern character '" + c + "' should be non-zero",
+ (map[idx.lookup(c)] & (1L<<where)) != 0);
+ } else {
+ assertEquals("Map for unused character '" + c + "' should be zero",
+ 0, map[idx.lookup(c)]);
+ }
+ }
+ }
+}