| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| * use this file except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| * License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| package com.google.gwt.dev.util.editdistance; |
| |
| /** |
| * A performance-oriented character-indexed map. |
| * Provides a mapping from characters in a limited alphabet |
| * (defined by a CharSequence) to integer values (for array indexing). |
| * Emphasis is on performance (e.g., compared to using a generic HashMap), |
| * in terms of speed of (post-construction) lookup and space required. |
| * |
| * Maps are constructed using a static <tt>getInstance</tt> method that |
| * chooses an efficient implementation for a given string. |
| * The map interface consists of: |
| * a <tt>size</tt> method for determining the maximum range of indices used; |
| * a <tt>lookup</tt> method for translating a character to its index; and, |
| * a <tt>nullElement</tt> method that returns the index that is not |
| * returned by <tt>lookup</tt> for any characer actually in the map. |
| * |
| * The target application is the mapping of natural-language strings. |
| * Most strings include characters from a single language, and thus |
| * fall into a small slice of the Unicode space. In particular, |
| * many strings fall into just the Latin series. |
| */ |
| public abstract class CharIndex { |
| /** |
| * An index based on a garden-variety hash map. |
| * This is the implementation of last recourse (from a |
| * performance perspective). |
| * |
| * TODO(mwyoung): there may be value in an implementation |
| * that falls in between the masked and fullhash variants; |
| * for example, one using a closed hash table with rehashing. |
| */ |
| public static class FullHash extends CharIndex { |
| /** |
| * Mutable holder for a character. |
| */ |
| static class Char { |
| char c; |
| @Override |
| public boolean equals(Object x) { |
| return (x != null) && (((Char) x).c == this.c); |
| } |
| @Override |
| public int hashCode() { |
| return c; |
| } |
| @Override |
| public String toString() { |
| return "'" + c + "'"; |
| } |
| } |
| |
| static final int NULL_ELEMENT = 0; |
| protected int lastUsed = NULL_ELEMENT; |
| /** |
| * Mapping from pattern characters to their integer index values. |
| */ |
| final java.util.HashMap<Char,Integer> map; |
| |
| /** |
| * Constructs a full hash-based mapping. |
| */ |
| FullHash(CharSequence s) { |
| /* Choose a hash size larger at least twice the string length */ |
| int len = s.length(); |
| int power = Integer.highestOneBit(len); |
| |
| /* Create the map */ |
| map = new java.util.HashMap<Char,Integer>( |
| power << ((power == len) ? 1 : 2)); |
| |
| /* |
| * Add characters one at a time to the map |
| */ |
| |
| Char test = new Char(); /* (re)used for lookup */ |
| for (int i = 0; i < s.length(); i++) { |
| test.c = s.charAt(i); |
| if (map.get(test) == null) { |
| /* Not there... add it */ |
| map.put(test, new Integer(++lastUsed)); |
| |
| /* Grow a new holder for subsequent lookups */ |
| test = new Char(); |
| } |
| } |
| } |
| |
| @Override |
| public int lookup(char c) { |
| final Char lookupTest = new Char(); |
| lookupTest.c = c; |
| Integer result = map.get(lookupTest); |
| return (result != null) ? result.intValue() : 0; |
| } |
| |
| @Override |
| public int[] map(CharSequence s, int[] mapped) { |
| /* Create one mutable Char, and reuse it for all lookups */ |
| final Char lookupTest = new Char(); |
| |
| int len = s.length(); |
| if (mapped.length < len) { |
| mapped = new int[len]; |
| } |
| |
| for (int i = 0; i < len; i++) { |
| lookupTest.c = s.charAt(i); |
| Integer result = map.get(lookupTest); |
| mapped[i] = (result != null) ? result.intValue() : NULL_ELEMENT; |
| } |
| return mapped; |
| } |
| |
| @Override |
| public int nullElement() { |
| return NULL_ELEMENT; |
| } |
| |
| @Override |
| public int size() { |
| return lastUsed + 1; |
| } |
| } |
| |
| /** |
| * An index based on a simple mask: index(c) = (c & MASK). |
| * This allows for a very fast index function for many |
| * languages outside of the ASCII spectrum. Most languages |
| * fit in a single 8-bit Unicode slice. Even languages |
| * that cross slices for special characters (such as extended |
| * Latin) often have no collisions under simple masking. |
| */ |
| public static class Masked extends CharIndex { |
| /** |
| * Hash table size. |
| */ |
| static final int SIZE = 0x100; |
| |
| /** |
| * Mask used for hashing. |
| */ |
| static final int MASK = (SIZE - 1); |
| |
| /** |
| * Where we may invalid characters: beyond the hash table. |
| */ |
| static final int NULL_ELEMENT = SIZE; |
| |
| /** |
| * Generates an instance of this implementation if possible. |
| * @param s pattern string |
| * @return an instance of this CharIndex if the pattern satisfies |
| * the constraints for this implementation; otherwise, null |
| */ |
| static Masked generate(CharSequence s) { |
| /* |
| * This implementation requires that there be no hash collisions |
| * among the pattern characters, using a simple mask-based hash. |
| */ |
| char [] contains = new char[SIZE]; |
| |
| /* Ensure that for all x, hash(contains[x]) != x initially. */ |
| contains[0] = (char) 1; |
| |
| /* Hash characters, recording values seen, rejecting collisions */ |
| for (int i = 0; i < s.length(); i++) { |
| char c = s.charAt(i); |
| int index = c & MASK; |
| if (contains[index] != c) { |
| if ((contains[index] & MASK) == index) { |
| return null; |
| } |
| contains[index] = c; |
| } |
| } |
| return new Masked(contains); |
| } |
| |
| /** |
| * Closed hash table: characters actually indexed. |
| * Unused cells contain values inappropriate to their index. |
| */ |
| final char[] contains; |
| |
| /** |
| * Constructor based on hash table built by generate(). |
| */ |
| private Masked(char [] contains) { |
| this.contains = contains; |
| } |
| |
| @Override |
| public int lookup(char c) { |
| int index = c & MASK; |
| return (c == contains[index]) ? index : NULL_ELEMENT; |
| } |
| |
| @Override |
| public int[] map(CharSequence s, int[] mapped) { |
| int len = s.length(); |
| if (mapped.length < len) { |
| mapped = new int[len]; |
| } |
| for (int i = 0; i < len; i++) { |
| char c = s.charAt(i); |
| int index = c & MASK; |
| mapped[i] = (c == contains[index]) ? index : NULL_ELEMENT; |
| } |
| return mapped; |
| } |
| |
| @Override |
| public int nullElement() { return NULL_ELEMENT; } |
| |
| @Override |
| public int size() { return NULL_ELEMENT + 1; } |
| } |
| |
| /** |
| * An index based on the identity mapping: index(c) = c. |
| * This requires minimal computation and no additional space |
| * for the basic ASCII character set. |
| */ |
| public static class Straight extends CharIndex { |
| /** |
| * The largest character we will map (directly). |
| */ |
| static final int MAX = 0x80; |
| |
| /** |
| * A mask used to find characters that fall outside. |
| */ |
| static final int MASK = ~(MAX - 1); |
| |
| /** |
| * A map result we never generate for valid characters. |
| */ |
| static final int NULL_ELEMENT = MAX; |
| |
| /** |
| * Generates an instance of this implementation if possible. |
| * @param s pattern string |
| * @return an instance of this CharIndex if the pattern satisfies |
| * the constraints for this implementation; otherwise, null |
| */ |
| static Straight generate(CharSequence s) { |
| /* This implementation requires that all characters fall below MAX */ |
| for (int i = 0; i < s.length(); i++) { |
| if ((s.charAt(i) & MASK) != 0) { |
| return null; |
| } |
| } |
| |
| return new Straight(); |
| } |
| |
| /** |
| * Simple private constructor, no state required. |
| */ |
| private Straight() { } |
| |
| @Override |
| public int lookup(char c) { |
| return ((c & MASK) == 0) ? c : NULL_ELEMENT; |
| } |
| |
| @Override |
| public int[] map(CharSequence s, int[] mapped) { |
| int len = s.length(); |
| if (mapped.length < len) { |
| mapped = new int[len]; |
| } |
| for (int i = 0; i < len; i++) { |
| char c = s.charAt(i); |
| mapped[i] = ((c & MASK) == 0) ? c : NULL_ELEMENT; |
| } |
| return mapped; |
| } |
| |
| @Override |
| public int nullElement() { |
| return NULL_ELEMENT; |
| } |
| |
| @Override |
| public int size() { |
| return NULL_ELEMENT + 1; |
| } |
| } |
| |
| /** |
| * Generates an index for a given alphabet, defined by a character sequence. |
| * An appropriate implementation is chosen for the alphabet. The |
| * character sequence is allowed to contain multiple instances of |
| * the same character. |
| * |
| * @param s the alphabet to be indexed |
| * @return an index appropriate to this alphabet |
| */ |
| public static CharIndex getInstance(CharSequence s) { |
| CharIndex result; |
| |
| /* Try fastest mappings first, then more general ones */ |
| |
| if ((result = Straight.generate(s)) != null) { |
| return result; |
| } |
| |
| if ((result = Masked.generate(s)) != null) { |
| return result; |
| } |
| |
| /* Full hash always works */ |
| return new FullHash(s); |
| } |
| |
| /** |
| * Returns the index (mapping result) for a given character. |
| * If the character is present in the alphabet, then a unique index |
| * is returned; other characters may be mapped to unique |
| * values, *or* may be mapped to a shared <tt>nullElement</tt> value. |
| * @param c character for which an index is sought |
| * @return an integer index for the character |
| */ |
| public abstract int lookup(char c); |
| |
| /** |
| * Performs lookups on an entire sequence of characters. |
| * This allows an entire string to be mapped with fewer |
| * method invocations, and possibly with some added internal |
| * efficiencies. |
| * |
| * @param s character sequence to be indexed |
| * @param mapped array for storing results |
| * @return an array of indices for the character sequence, |
| * either the <tt>mapped</tt> parameter or a larger newly-allocated |
| * array if required |
| */ |
| public abstract int[] map(CharSequence s, int[] mapped); |
| |
| /** |
| * Returns an index that is not used for any character in |
| * the alphabet. This index *may* be returned for characters |
| * not in the alphabet. |
| * @return an index not used for a valid character |
| */ |
| public abstract int nullElement(); |
| |
| /** |
| * Returns the maximum index result that can be returned by |
| * the <tt>lookup</tt> function (including the <tt>nullElement</tt> result |
| * if applicable). No negative values are ever returned |
| * from <tt>lookup</tt>. |
| * @return maximum index returned from any <tt>lookup</tt> |
| */ |
| public abstract int size(); |
| } |