| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| * use this file except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| * License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| package com.google.gwt.dev.util.editdistance; |
| |
| /** |
| * A collection of instance generators for the GeneralEditDistance interface. |
| */ |
| public class GeneralEditDistances { |
| /** |
| * Chooses the best implementation of Levenshtein string edit distance |
| * available at the current time. |
| */ |
| /* |
| * As of 2007-08-23, the best algorithm known (to the author=mwyoung) for |
| * short strings is one due to Eugene Myers, except for the special case |
| * where the distance limit is 0 or 1. The Myers algorithm also has good |
| * worst-case performance for long strings when the edit distance is not |
| * reasonably bounded. |
| * |
| * When there is a good bound, a variant of the Ukkonen algorithm due to |
| * Berghel and Roach (modified by Michael Young to use linear space) |
| * is faster for long strings. |
| * |
| * Note that other algorithms that perform better in some cases for running |
| * text searches do not outperform Myers for rigid distance computations. |
| * Notably: |
| * Navarro/Baeza-Yates (Algorithmica 23,2) simulates an NFA with an |
| * epsilon-cycle on the initial state (appropriate for running texts) |
| * and reports success without computing exact distance. When adjusted |
| * to a fixed starting point and computing distance, its state machine |
| * is larger and it underperforms. |
| * |
| * BITAP (Baeza-Yates/Gonnet, Manber/Wu) also simulates an NFA, and |
| * Navarro claims that it wins for small patterns and small limits for |
| * running search. Experiments with a Java implementation showed that |
| * it beat Myers on pure string edit distance only for limits where the |
| * special 0-1 limit applied, where special-case comparison beats all. |
| * |
| * A survey of algorithms for running text search by Navarro appeared |
| * in ACM Computing Surveys 33#1: http://portal.acm.org/citation.cfm?id=375365 |
| * Another algorithm (Four Russians) that Navarro claims superior for very |
| * long patterns and high limits was not evaluated for inclusion here. |
| * Filtering algorithms also improve running search, but do not help |
| * for pure edit distance. |
| */ |
| private static class Levenshtein implements GeneralEditDistance { |
| /** |
| * Long+bounded implementation class: distance-only Berghel-Roach. |
| */ |
| private ModifiedBerghelRoachEditDistance berghel; |
| |
| /** |
| * Short/unbounded implementation class: Myers bit-parallel. |
| */ |
| private MyersBitParallelEditDistance myers; |
| |
| /** |
| * Saved pattern, for specialized comparisons. |
| */ |
| private final CharSequence pattern; |
| |
| /** |
| * Length of saved pattern. |
| */ |
| private final int patternLength; |
| |
| private Levenshtein(CharSequence pattern) { |
| this.pattern = pattern; |
| this.patternLength = pattern.length(); |
| } |
| |
| public GeneralEditDistance duplicate() { |
| Levenshtein dup = new Levenshtein(pattern); |
| |
| /* Duplicate the Myers engine, as it is cheaper than rebuilding */ |
| if (this.myers != null) { |
| dup.myers = (MyersBitParallelEditDistance) this.myers.duplicate(); |
| } |
| |
| /* Do not duplicate the Berghel engine; it provides no savings. */ |
| |
| return dup; |
| } |
| |
| public int getDistance(CharSequence target, int limit) { |
| /* When the limit is 0 or 1, specialized comparisons are much faster. */ |
| if (limit <= 1) { |
| return limit == 0 ? |
| (pattern.equals(target) ? 0 : 1) : |
| atMostOneError(pattern, target); |
| } |
| |
| /* |
| * The best algorithm for long strings depends on the resulting |
| * edit distance (or the limit placed on it). Without further |
| * information on the likelihood of a low distance, we guess |
| * based on the provided limit. We currently lean toward using |
| * the Myers algorithm unless we are pretty sure that the |
| * Berghel-Roach algorithm will win (based on the limit). |
| * |
| * Note that when the string lengths are small (fewer characters |
| * than bits in a long), Myers wins regardless of limit. |
| */ |
| if ((patternLength > 64) |
| && (limit < (target.length() / 10))) { |
| if (berghel == null) { |
| berghel = ModifiedBerghelRoachEditDistance.getInstance(pattern); |
| } |
| return berghel.getDistance(target, limit); |
| } |
| |
| if (myers == null) { |
| myers = MyersBitParallelEditDistance.getInstance(pattern); |
| } |
| |
| return myers.getDistance(target, limit); |
| } |
| } |
| |
| /** |
| * Compares two strings for at most one insert/delete/substitute difference. |
| * Since operations cannot be composed, a simple case analysis is possible. |
| * |
| * @param s1 one string to be compared |
| * @param s2 the other string to be compared |
| * @return Levenshtein edit distance if no greater than 1; |
| * otherwise, more than 1 |
| */ |
| public static int atMostOneError(CharSequence s1, CharSequence s2) { |
| int s1Length = s1.length(); |
| int s2Length = s2.length(); |
| int errors = 0; /* running count of edits required */ |
| |
| switch(s2Length - s1Length) { |
| /* |
| * Strings are the same length. No single insert/delete is possible; |
| * at most one substitution can be present. |
| */ |
| case 0: |
| for (int i = 0; i < s2Length; i++) { |
| if ((s2.charAt(i) != s1.charAt(i)) && (errors++ != 0)) { |
| break; |
| } |
| } |
| return errors; |
| |
| /* |
| * Strings differ in length by 1, so we have an insertion |
| * (and therefore cannot have any other substitutions). |
| */ |
| case 1: /* s2Length > s1Length */ |
| for (int i = 0; i < s1Length; i++) { |
| if (s2.charAt(i) != s1.charAt(i)) { |
| for (; i < s1Length; i++) { |
| if (s2.charAt(i + 1) != s1.charAt(i)) { |
| return 2; |
| } |
| } |
| return 1; |
| } |
| } |
| return 1; |
| |
| /* Same as above case, with strings reversed */ |
| case -1: /* s1Length > s2Length */ |
| for (int i = 0; i < s2Length; i++) { |
| if (s2.charAt(i) != s1.charAt(i)) { |
| for (; i < s2Length; i++) { |
| if (s2.charAt(i) != s1.charAt(i + 1)) { |
| return 2; |
| } |
| } |
| return 1; |
| } |
| } |
| return 1; |
| |
| /* Edit distance is at least difference in lengths; more than 1 here. */ |
| default: |
| return 2; |
| } |
| } |
| |
| /** |
| * Generates an GeneralEditDistance engine for a particular pattern string |
| * based on Levenshtein distance. Caller must ensure that the |
| * pattern does not change (consider using pattern.toString() if |
| * necessary) as long as the generated object is to be used. |
| * |
| * @param pattern a string from which distance computations are desired |
| * @return an engine for computing Levenshtein distances from that pattern |
| */ |
| public static GeneralEditDistance |
| getLevenshteinDistance(CharSequence pattern) { |
| return new Levenshtein(pattern); |
| } |
| |
| private GeneralEditDistances() { } |
| } |