blob: 9e02ebdfe6c4b5a23f92b1efa4d41c4bac597b5f [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.gwt.dev.util.editdistance;
/**
* Computes Levenshtein string-edit distance using the
* algorithm of Eugene Myers (see "A fast bit-vector algorithm for
* approximate string matching based on dynamic progamming",
* Journal of the ACM, 46(3): 395-415, 1999), using the reformulations
* of Heikki Hyyro (see "Explaining and Extending the Bit-parallel
* Approximate String Matching Algorithm of Myers") along with
* Gonzalo Navarro ("Faster Bit-Parallel Approximate String Matching",
* Proc. 13th Combinatorial Pattern Matching (CPM'2002)).
*/
public abstract class MyersBitParallelEditDistance
implements GeneralEditDistance, Cloneable {
/*
* The Myers algorithm is based on an adaptation of the traditional
* dynamic programming algorithm, in which a matrix of distances
* is computed column-wise (or row-wise). For strings of length
* M and N, an MxN matrix (D) is filled with distances for substrings:
* D[i,j] holds the distance for string1[0..i]=>string2[0..j].
* The matrix is initially populated with the trivial values
* D[0,j]=j and D[i,0]=i; and then expanded with the rule:
* D[i,j] = min( D[i-1,j]+1, // insertion
* D[i,j-1]+1, // deletion
* (D[i-1,j-1]
* + (string1[i]==string2[j])
* ? 0 // match
* : 1 // substitution ) )
*
* The Myers algorithm takes advantage of the observation that
* the difference from one matrix position to its higher diagonal
* is either 0 or 1, and consequently row-to-row and column-to-column
* differences are 1, 0, or -1. It organizes whole columns of
* differences into bit arrays, computing new columns with
* arithmetic on whole words. The edit distance is computed
* by tracking the column changes in the last row.
*
* The description below follows the naming from the Hyrro papers above.
* The pattern string (length M) is arranged vertically; the
* string to be matched (length N) is horizontal. Columns are
* represented by bit-arrays with (1<<j) set for row i. The
* inter-cell differences are grouped into column-variables:
* VN has bits set when the transition from row i-1 to row i is
* negative; VP has bits set when it is positive; HN and HP
* have bits set for column j-1 to j transitions that are
* negative or positive, respectively. DZ has bits set when
* the diagonal from (i-1,j-1) to (i,j) is zero.
*
* The computation proceeds as follows:
* We start at column 0, where all VP bits are 1. The distance
* is accumulated in row M, with initial value M in the matrix.
*
* For each new column, we compute the diagonal-zero (DZ) first.
* This happens under three circumstances:
*
* when the pattern matches in this position ("match" above):
* We make use of a bitmask, one for each character
* in the alphabet, that has bits set wherever that character
* appears in the pattern. These bitmasks are pre-computed
* when the pattern is established (in the class constructor).
* In the column loop, we simply look up the bitmask for the
* character in this column. We call this PM.
*
* when the prior column has a negative vertical transition
* in this row ("deletion"):
* This is simply the VN bit array from the prior column.
*
* when the row above has a negative transition in this column
* ("insertion").
* This happens when the diagonal coming into the cell above
* (in *this* column) is zero but the vertical transition
* in the preceding column for that row is positive. That is:
*
* D[i-2,j-1]
* | \
* +1 0
* | \
* D[i-1,j-1] -1 -> D[i-1,j]
* | \ |
* ? 0 +1
* | \ |
* D[i,j-1] D[i,j]
*
* That is DZ[i,j] = DZ[i-1,j] & VP[i-1,j-1].
* Note that this is recursive in the DZ[*,j] bit array, but we
* want to compute the whole array at once. We need an expression
* that will propagate a 1 bit down the DZ column when the associated
* VP bit is set. Noting that the DZ[*,0] is the least significant
* bit, this means propagating to more significant bits. The carry bit
* of an addition has this behavior. If we set
* DZ = (PM | VN) // first two cases
* then we can propagate bits up iff the corresponding VP bit
* is set with:
* DZ |= (((DZ & VP) + VP) ^ VP);
* The (DZ&VP) piece covers our required condition on VP. Adding
* the VP bits causes any 1 bits from (DZ&VP) to result in a carry,
* but also introduces a 1 bit where VP=1 but DZ=0, which are then
* cleared with the ^VP piece. The propagated bits are |ed into the
* base case to get a final result.
*
* Once we have DZ, we can compute HP and HN. Note that VP and VN have
* not yet been updated, and so refer to the preceding column; DZ refers
* to the current column.
* HN is true when DZ is true in this column but VP was true
* in the prior column (HN = DZ & VP)
* HP is true when: the prior VN is true (otherwise, the diagonal would
* be negative, and that cannot happen); or, DZ is false (meaning
* the diagonal increased by 1) and the prior-column VP is also false
* (HP = VN | ~(DZ|VP))
*
* Once we have HP and HN, we can tally the effect on the last row (adding
* or subtracting one, respectively).
*
* Finally, we can compute VP and VN for this column, in preparation
* for the next round. These computations are analagous to the ones for
* HN and HP above, except that there, the incoming V values already
* reflected the prior column and DZ this one, but now the H and DZ values
* reflect the same column. So, we first shift the H values to associate
* with the proper bits from DZ.
*/
/**
* A trivial implementation for the zero-length string
*/
static class Empty extends MyersBitParallelEditDistance {
Empty(CharSequence s) {
super(s);
}
@Override
public GeneralEditDistance duplicate() {
return this; /* thread-safe */
}
@Override
public int getDistance(CharSequence s, int k) {
return s.length();
}
}
/**
* An implementation using multiple integers for each column.
* This follows the pattern for the single-word implementation, with
* these changes:
* sums/shifts must propagate from one word to the next;
* to make that easier (and cheaper), we use only (SIZE-1) bits per word;
*/
static class Multi extends MyersBitParallelEditDistance {
/* How many integers we use per column */
int count;
/**
* Where the last-row bit lives -- only in the last array slot, though
*/
final int lastBitPosition;
/**
* Bitmaps for pattern string: [pattern_index][column_index]
*/
final int[][] positions;
int[] verticalNegativesReusable;
/* Reusable arrays for vertical changes */
int[] verticalPositivesReusable;
/**
* A mask with those bits set
*/
final int wordMask = (-1 >>> 1);
/**
* How many bits we use per word
*/
final int wordSize = Integer.SIZE - 1;
/**
* Constructs a multi-word engine
*/
Multi(CharSequence s) {
super(s);
/* Compute number of words to use (rounding up) */
count = (m + wordSize - 1) / wordSize;
/* Precompute bitmaps for pattern string */
positions = PatternBitmap.map(s, idx, new int [idx.size()][], wordSize);
/* Where in last word does last row appear */
lastBitPosition = (1 << ((m - 1) % wordSize));
/* Initialize scratchpad items */
perThreadInit();
}
@Override
public int getDistance(CharSequence s, int k) {
indices = idx.map(s, indices);
/* Initialize verticalPositive to all bits on, verticalNegative all off */
int[] verticalPositives = verticalPositivesReusable;
java.util.Arrays.fill(verticalPositives, wordMask);
int[] verticalNegatives = verticalNegativesReusable;
java.util.Arrays.fill(verticalNegatives, 0);
int distance = m;
int len = s.length();
/* We can only miss the distance-- below this many times: */
int maxMisses = k + len - m;
if (maxMisses < 0) maxMisses = Integer.MAX_VALUE;
outer:
for (int j = 0; j < len; j++) {
int[] position = positions[indices[j]];
/* Carry bits from one word to the next */
int sum = 0;
int horizontalPositiveShift = 1;
int horizontalNegativeShift = 0;
/* Iterate through words for this column */
for (int i = 0; i < count; i++) {
int verticalNegative = verticalNegatives[i];
int patternMatch = (position[i] | verticalNegative);
int verticalPositive = verticalPositives[i];
sum = (verticalPositive & patternMatch)
+ (verticalPositive) + (sum >>> wordSize);
int diagonalZero = ((sum & wordMask) ^ verticalPositive)
| patternMatch;
int horizontalPositive = (verticalNegative
| ~(diagonalZero | verticalPositive));
int horizontalNegative = diagonalZero & verticalPositive;
if (i == (count - 1)) { /* only last bit in last word */
if ((horizontalNegative & lastBitPosition) != 0) {
distance--;
} else if ((horizontalPositive & lastBitPosition) != 0) {
distance++;
if ((maxMisses -= 2) < 0) {
break outer;
}
} else if (--maxMisses < 0) {
break outer;
}
}
horizontalPositive = ((horizontalPositive << 1)
| horizontalPositiveShift);
horizontalPositiveShift = (horizontalPositive >>> wordSize);
horizontalNegative = ((horizontalNegative << 1)
| horizontalNegativeShift);
horizontalNegativeShift = (horizontalNegative >>> wordSize);
verticalPositives[i] = (horizontalNegative
| ~(diagonalZero | horizontalPositive))
& wordMask;
verticalNegatives[i] = (diagonalZero & horizontalPositive) & wordMask;
}
}
return distance;
}
@Override
protected void perThreadInit() {
super.perThreadInit();
/* Allocate verticalPositive/verticalNegative arrays */
verticalPositivesReusable = new int[count];
verticalNegativesReusable = new int[count];
}
}
/*
* The following code is duplicated with both "int" and "long"
* as the word primitive type. To improve maintainability, all instances
* of the word type are marked with "WORD" in comments. This
* allows the "long" version to be regenerated from the "int" version
* through a mechanical replacement. To make this work, the
* class name also needs to include the primitive type (without
* altered capitalization) and the WORD marker -- this is a quite
* intentional violation of the usual class naming rules.
*/
/**
* An implementation using "int" as the word size.
*/
// Replace "int/*WORD*/" with "long/*WORD*/" to generate the long version.
static class TYPEint/*WORD*/ extends MyersBitParallelEditDistance {
final int/*WORD*/ lastBitPosition;
final int/*WORD*/[] map;
@SuppressWarnings("cast")
TYPEint/*WORD*/(CharSequence s) {
super(s);
/* Precompute bitmaps for this pattern */
map = PatternBitmap.map(s, idx, new int/*WORD*/[idx.size()]);
/* Compute the bit that represents a change in the last row */
lastBitPosition = (((int/*WORD*/)1) << (m - 1));
}
@Override
public int getDistance(CharSequence s, int k) {
int len = s.length();
/* Quick check based on length */
if (((len - m) > k) || ((m - len) > k))
return k + 1;
/* Map characters to their integer positions in the bitmap array */
indices = idx.map(s, indices);
/* Initially, vertical change is all positive (none negative) */
int/*WORD*/ verticalPositive = -1;
int/*WORD*/ verticalNegative = 0;
int distance = m;
/* We can only miss the "distance--" below this many times: */
int maxMisses = k + len - m;
if (maxMisses < 0) maxMisses = Integer.MAX_VALUE;
for (int j = 0; j < len; j++) {
/* Where is diagonal zero: matches, or prior VN; plus recursion */
int/*WORD*/ diagonalZero = map[indices[j]] | verticalNegative;
diagonalZero |= (((diagonalZero & verticalPositive) + verticalPositive)
^ verticalPositive);
/* Compute horizontal changes */
int/*WORD*/ horizontalPositive = verticalNegative
| ~(diagonalZero | verticalPositive);
int/*WORD*/ horizontalNegative = diagonalZero & verticalPositive;
/* Update final distance based on horizontal changes */
if ((horizontalNegative & lastBitPosition) != 0) {
distance--;
} else if ((horizontalPositive & lastBitPosition) != 0) {
distance++;
if ((maxMisses -= 2) < 0) {
break;
}
} else if (--maxMisses < 0) {
break;
}
/* Shift Hs to next row, compute new Vs analagously to Hs above */
horizontalPositive = (horizontalPositive << 1) | 1;
verticalPositive = (horizontalNegative << 1)
| ~(diagonalZero | horizontalPositive);
verticalNegative = diagonalZero & horizontalPositive;
}
return distance;
}
}
/**
* An implementation using "long" as the word size.
* GENERATED MECHANICALLY FROM THE "int" VERSION ABOVE.
* DO NOT EDIT THIS ONE -- EDIT ABOVE AND REAPPLY QUERY/REPLACE.
*/
static class TYPElong/*WORD*/ extends MyersBitParallelEditDistance {
final long/*WORD*/ lastBitPosition;
final long/*WORD*/[] map;
@SuppressWarnings("cast")
TYPElong/*WORD*/(CharSequence s) {
super(s);
/* Precompute bitmaps for this pattern */
map = PatternBitmap.map(s, idx, new long/*WORD*/[idx.size()]);
/* Compute the bit that represents a change in the last row */
lastBitPosition = (((long/*WORD*/)1) << (m - 1));
}
@Override
public int getDistance(CharSequence s, int k) {
int len = s.length();
/* Quick check based on length */
if (((len - m) > k) || ((m - len) > k))
return k + 1;
/* Map characters to their integer positions in the bitmap array */
indices = idx.map(s, indices);
/* Initially, vertical change is all positive (none negative) */
long/*WORD*/ verticalPositive = -1;
long/*WORD*/ verticalNegative = 0;
int distance = m;
/* We can only miss the "distance--" below this many times: */
int maxMisses = k + len - m;
if (maxMisses < 0) maxMisses = Integer.MAX_VALUE;
for (int j = 0; j < len; j++) {
/* Where is diagonal zero: matches, or prior VN; plus recursion */
long/*WORD*/ diagonalZero = map[indices[j]] | verticalNegative;
diagonalZero |= (((diagonalZero & verticalPositive) + verticalPositive)
^ verticalPositive);
/* Compute horizontal changes */
long/*WORD*/ horizontalPositive = verticalNegative
| ~(diagonalZero | verticalPositive);
long/*WORD*/ horizontalNegative = diagonalZero & verticalPositive;
/* Update final distance based on horizontal changes */
if ((horizontalNegative & lastBitPosition) != 0) {
distance--;
} else if ((horizontalPositive & lastBitPosition) != 0) {
distance++;
if ((maxMisses -= 2) < 0) {
break;
}
} else if (--maxMisses < 0) {
break;
}
/* Shift Hs to next row, compute new Vs analagously to Hs above */
horizontalPositive = (horizontalPositive << 1) | 1;
verticalPositive = (horizontalNegative << 1)
| ~(diagonalZero | horizontalPositive);
verticalNegative = diagonalZero & horizontalPositive;
}
return distance;
}
}
/**
* Chooses an appropriate implementation for a given pattern string.
* @param s pattern string
* @return distance calculator appropriate for the pattern
*/
public static MyersBitParallelEditDistance getInstance(CharSequence s) {
int m = s.length();
return (m <= Integer.SIZE) ?
((m == 0) ? new Empty(s) : new TYPEint(s)) :
(s.length() <= Long.SIZE) ?
new TYPElong(s) :
new Multi(s);
}
/**
* Tests a computation manually
*/
public static void main(String[] args) {
MyersBitParallelEditDistance b = getInstance(args[0]);
int k = args.length > 2 ? Integer.parseInt(args[2]) : 0;
System.out.println("Result: " + b.getDistance(args[1], k));
}
/**
* Index mapping for pattern string
*/
final CharIndex idx;
/**
* Reusable array of indices for target strings
*/
int[] indices = new int[0];
/**
* Length of pattern
*/
final int m;
/**
* Constructs a distance calculator for a given string
*/
protected MyersBitParallelEditDistance(CharSequence s) {
m = s.length();
idx = CharIndex.getInstance(s);
}
public GeneralEditDistance duplicate() {
try {
return (MyersBitParallelEditDistance)clone();
} catch (CloneNotSupportedException x) { /*IMPOSSIBLE */
throw new IllegalStateException("Cloneable object would not clone");
}
}
/**
* Computes distance from the pattern to a given string, bounded by
* a limiting distance @see(GeneralEditDistance.getDistance(CharSequence,int))
*/
public abstract int getDistance(CharSequence s, int k);
@Override
protected Object clone() throws CloneNotSupportedException {
Object obj = super.clone();
/* Re-initialize any non-thread-safe parts */
((MyersBitParallelEditDistance)obj).perThreadInit();
return obj;
}
/**
* Initializes items that cannot be shared among threads
*/
protected void perThreadInit() {
indices = new int[0];
}
}