blob: 3e76d4dfb73f825a2c2a5c7265e0241e8ee26c12 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.gwt.dev.util.editdistance;
/**
* A performance-oriented character-indexed map.
* Provides a mapping from characters in a limited alphabet
* (defined by a CharSequence) to integer values (for array indexing).
* Emphasis is on performance (e.g., compared to using a generic HashMap),
* in terms of speed of (post-construction) lookup and space required.
*
* Maps are constructed using a static <tt>getInstance</tt> method that
* chooses an efficient implementation for a given string.
* The map interface consists of:
* a <tt>size</tt> method for determining the maximum range of indices used;
* a <tt>lookup</tt> method for translating a character to its index; and,
* a <tt>nullElement</tt> method that returns the index that is not
* returned by <tt>lookup</tt> for any characer actually in the map.
*
* The target application is the mapping of natural-language strings.
* Most strings include characters from a single language, and thus
* fall into a small slice of the Unicode space. In particular,
* many strings fall into just the Latin series.
*/
public abstract class CharIndex {
/**
* An index based on a garden-variety hash map.
* This is the implementation of last recourse (from a
* performance perspective).
*
* TODO(mwyoung): there may be value in an implementation
* that falls in between the masked and fullhash variants;
* for example, one using a closed hash table with rehashing.
*/
public static class FullHash extends CharIndex {
/**
* Mutable holder for a character.
*/
static class Char {
char c;
@Override
public boolean equals(Object x) {
return (x != null) && (((Char) x).c == this.c);
}
@Override
public int hashCode() {
return c;
}
@Override
public String toString() {
return "'" + c + "'";
}
}
static final int NULL_ELEMENT = 0;
protected int lastUsed = NULL_ELEMENT;
/**
* Mapping from pattern characters to their integer index values.
*/
final java.util.HashMap<Char,Integer> map;
/**
* Constructs a full hash-based mapping.
*/
FullHash(CharSequence s) {
/* Choose a hash size larger at least twice the string length */
int len = s.length();
int power = Integer.highestOneBit(len);
/* Create the map */
map = new java.util.HashMap<Char,Integer>(
power << ((power == len) ? 1 : 2));
/*
* Add characters one at a time to the map
*/
Char test = new Char(); /* (re)used for lookup */
for (int i = 0; i < s.length(); i++) {
test.c = s.charAt(i);
if (map.get(test) == null) {
/* Not there... add it */
map.put(test, new Integer(++lastUsed));
/* Grow a new holder for subsequent lookups */
test = new Char();
}
}
}
@Override
public int lookup(char c) {
final Char lookupTest = new Char();
lookupTest.c = c;
Integer result = map.get(lookupTest);
return (result != null) ? result.intValue() : 0;
}
@Override
public int[] map(CharSequence s, int[] mapped) {
/* Create one mutable Char, and reuse it for all lookups */
final Char lookupTest = new Char();
int len = s.length();
if (mapped.length < len) {
mapped = new int[len];
}
for (int i = 0; i < len; i++) {
lookupTest.c = s.charAt(i);
Integer result = map.get(lookupTest);
mapped[i] = (result != null) ? result.intValue() : NULL_ELEMENT;
}
return mapped;
}
@Override
public int nullElement() {
return NULL_ELEMENT;
}
@Override
public int size() {
return lastUsed + 1;
}
}
/**
* An index based on a simple mask: index(c) = (c & MASK).
* This allows for a very fast index function for many
* languages outside of the ASCII spectrum. Most languages
* fit in a single 8-bit Unicode slice. Even languages
* that cross slices for special characters (such as extended
* Latin) often have no collisions under simple masking.
*/
public static class Masked extends CharIndex {
/**
* Hash table size.
*/
static final int SIZE = 0x100;
/**
* Mask used for hashing.
*/
static final int MASK = (SIZE - 1);
/**
* Where we may invalid characters: beyond the hash table.
*/
static final int NULL_ELEMENT = SIZE;
/**
* Generates an instance of this implementation if possible.
* @param s pattern string
* @return an instance of this CharIndex if the pattern satisfies
* the constraints for this implementation; otherwise, null
*/
static Masked generate(CharSequence s) {
/*
* This implementation requires that there be no hash collisions
* among the pattern characters, using a simple mask-based hash.
*/
char [] contains = new char[SIZE];
/* Ensure that for all x, hash(contains[x]) != x initially. */
contains[0] = (char) 1;
/* Hash characters, recording values seen, rejecting collisions */
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
int index = c & MASK;
if (contains[index] != c) {
if ((contains[index] & MASK) == index) {
return null;
}
contains[index] = c;
}
}
return new Masked(contains);
}
/**
* Closed hash table: characters actually indexed.
* Unused cells contain values inappropriate to their index.
*/
final char[] contains;
/**
* Constructor based on hash table built by generate().
*/
private Masked(char [] contains) {
this.contains = contains;
}
@Override
public int lookup(char c) {
int index = c & MASK;
return (c == contains[index]) ? index : NULL_ELEMENT;
}
@Override
public int[] map(CharSequence s, int[] mapped) {
int len = s.length();
if (mapped.length < len) {
mapped = new int[len];
}
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
int index = c & MASK;
mapped[i] = (c == contains[index]) ? index : NULL_ELEMENT;
}
return mapped;
}
@Override
public int nullElement() { return NULL_ELEMENT; }
@Override
public int size() { return NULL_ELEMENT + 1; }
}
/**
* An index based on the identity mapping: index(c) = c.
* This requires minimal computation and no additional space
* for the basic ASCII character set.
*/
public static class Straight extends CharIndex {
/**
* The largest character we will map (directly).
*/
static final int MAX = 0x80;
/**
* A mask used to find characters that fall outside.
*/
static final int MASK = ~(MAX - 1);
/**
* A map result we never generate for valid characters.
*/
static final int NULL_ELEMENT = MAX;
/**
* Generates an instance of this implementation if possible.
* @param s pattern string
* @return an instance of this CharIndex if the pattern satisfies
* the constraints for this implementation; otherwise, null
*/
static Straight generate(CharSequence s) {
/* This implementation requires that all characters fall below MAX */
for (int i = 0; i < s.length(); i++) {
if ((s.charAt(i) & MASK) != 0) {
return null;
}
}
return new Straight();
}
/**
* Simple private constructor, no state required.
*/
private Straight() { }
@Override
public int lookup(char c) {
return ((c & MASK) == 0) ? c : NULL_ELEMENT;
}
@Override
public int[] map(CharSequence s, int[] mapped) {
int len = s.length();
if (mapped.length < len) {
mapped = new int[len];
}
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
mapped[i] = ((c & MASK) == 0) ? c : NULL_ELEMENT;
}
return mapped;
}
@Override
public int nullElement() {
return NULL_ELEMENT;
}
@Override
public int size() {
return NULL_ELEMENT + 1;
}
}
/**
* Generates an index for a given alphabet, defined by a character sequence.
* An appropriate implementation is chosen for the alphabet. The
* character sequence is allowed to contain multiple instances of
* the same character.
*
* @param s the alphabet to be indexed
* @return an index appropriate to this alphabet
*/
public static CharIndex getInstance(CharSequence s) {
CharIndex result;
/* Try fastest mappings first, then more general ones */
if ((result = Straight.generate(s)) != null) {
return result;
}
if ((result = Masked.generate(s)) != null) {
return result;
}
/* Full hash always works */
return new FullHash(s);
}
/**
* Returns the index (mapping result) for a given character.
* If the character is present in the alphabet, then a unique index
* is returned; other characters may be mapped to unique
* values, *or* may be mapped to a shared <tt>nullElement</tt> value.
* @param c character for which an index is sought
* @return an integer index for the character
*/
public abstract int lookup(char c);
/**
* Performs lookups on an entire sequence of characters.
* This allows an entire string to be mapped with fewer
* method invocations, and possibly with some added internal
* efficiencies.
*
* @param s character sequence to be indexed
* @param mapped array for storing results
* @return an array of indices for the character sequence,
* either the <tt>mapped</tt> parameter or a larger newly-allocated
* array if required
*/
public abstract int[] map(CharSequence s, int[] mapped);
/**
* Returns an index that is not used for any character in
* the alphabet. This index *may* be returned for characters
* not in the alphabet.
* @return an index not used for a valid character
*/
public abstract int nullElement();
/**
* Returns the maximum index result that can be returned by
* the <tt>lookup</tt> function (including the <tt>nullElement</tt> result
* if applicable). No negative values are ever returned
* from <tt>lookup</tt>.
* @return maximum index returned from any <tt>lookup</tt>
*/
public abstract int size();
}