DamerauLevenshteinDistance.java

  1. package mtas.codec.util.distance;

  2. import java.io.IOException;
  3. import java.util.Arrays;
  4. import java.util.Map;
  5. import java.util.Map.Entry;

  6. import org.apache.lucene.util.BytesRef;

  7. /**
  8.  * The Class DamerauLevenshteinDistance.
  9.  */
  10. public class DamerauLevenshteinDistance extends LevenshteinDistance {

  11.   /** The Constant defaultTranspositionDistance. */
  12.   protected final static double defaultTranspositionDistance = 1.0;

  13.   /** The transposition distance. */
  14.   protected double transpositionDistance;

  15.   /** The Constant PARAMETER_TRANSPOSITIONDISTANCE. */
  16.   protected final static String PARAMETER_TRANSPOSITIONDISTANCE = "transpositionDistance";

  17.   /**
  18.    * Instantiates a new damerau levenshtein distance.
  19.    *
  20.    * @param prefix the prefix
  21.    * @param base the base
  22.    * @param maximum the maximum
  23.    * @param parameters the parameters
  24.    * @throws IOException Signals that an I/O exception has occurred.
  25.    */
  26.   public DamerauLevenshteinDistance(String prefix, String base, Double minimum, Double maximum,
  27.       Map<String, String> parameters) throws IOException {
  28.     super(prefix, base, minimum, maximum, parameters);
  29.     transpositionDistance = defaultTranspositionDistance;
  30.     if (parameters != null) {
  31.       for (Entry<String, String> entry : parameters.entrySet()) {
  32.         if (entry.getKey().equals(PARAMETER_TRANSPOSITIONDISTANCE)) {
  33.           transpositionDistance = Double.parseDouble(entry.getValue());
  34.         }
  35.       }
  36.     }
  37.     if (transpositionDistance < 0) {
  38.       throw new IOException("distances should be zero or positive");
  39.     }
  40.   }

  41.   /*
  42.    * (non-Javadoc)
  43.    *
  44.    * @see
  45.    * mtas.codec.util.distance.LevenshteinDistance#validate(org.apache.lucene.
  46.    * util.BytesRef)
  47.    */
  48.   @Override
  49.   public boolean validateMaximum(BytesRef term) {
  50.     if (maximum == null) {
  51.       return true;
  52.     } else {
  53.       double[][] state = _start();
  54.       char ch1;
  55.       char ch2 = 0x00;
  56.       int i = term.offset + prefixOffset;
  57.       for (; i < term.length; i++) {
  58.         ch1 = (char) term.bytes[i];
  59.         if (ch1 == 0x00) {
  60.           break;
  61.         }
  62.         state = _step(state, ch1, ch2);
  63.         if (!_can_match(state)) {
  64.           return false;
  65.         }
  66.         ch2 = ch1;
  67.       }
  68.       return _is_match(state);
  69.     }
  70.   }
  71.  
  72.   @Override
  73.   public double compute(BytesRef term) {
  74.     double[][] state = _start();
  75.     char ch1;
  76.     char ch2 = 0x00;
  77.     int i = term.offset + prefixOffset;
  78.     for (; i < term.length; i++) {
  79.       ch1 = (char) term.bytes[i];
  80.       if (ch1 == 0x00) {
  81.         break;
  82.       }
  83.       state = _step(state, ch1, ch2);
  84.       ch2 = ch1;
  85.     }
  86.     return _distance(state);
  87.   }

  88.   /*
  89.    * (non-Javadoc)
  90.    *
  91.    * @see mtas.codec.util.distance.LevenshteinDistance#compute(java.lang.String)
  92.    */
  93.   @Override
  94.   public double compute(String key) {
  95.     double[][] state = _start();
  96.     char ch2 = 0x00;
  97.     for (char ch1 : key.toCharArray()) {
  98.       if (ch1 == 0x00) {
  99.         break;
  100.       }
  101.       state = _step(state, ch1, ch2);
  102.       ch2 = ch1;
  103.     }
  104.     return _distance(state);
  105.   }

  106.   /**
  107.    * Start.
  108.    *
  109.    * @return the double[][]
  110.    */
  111.   private double[][] _start() {
  112.     double[][] startState = new double[3][];
  113.     startState[0] = new double[initialState.length];
  114.     startState[1] = new double[initialState.length];
  115.     startState[2] = Arrays.copyOf(initialState, initialState.length);
  116.     return startState;
  117.   }

  118.   /**
  119.    * Step.
  120.    *
  121.    * @param state the state
  122.    * @param ch1 the ch 1
  123.    * @param ch2 the ch 2
  124.    * @return the double[][]
  125.    */
  126.   private double[][] _step(double[][] state, char ch1, char ch2) {
  127.     double cost;
  128.     _shift(state);
  129.     state[2][0] = state[1][0] + deletionDistance;
  130.     for (int i = 0; i < base.length(); i++) {
  131.       cost = (base.charAt(i) == ch1) ? 0 : replaceDistance;
  132.       state[2][i + 1] = Math.min(state[2][i] + insertionDistance,
  133.           state[1][i] + cost);
  134.       state[2][i + 1] = Math.min(state[2][i + 1],
  135.           state[1][i + 1] + deletionDistance);
  136.       if (i > 0 && ch2 != 0x00 && (base.charAt(i - 1) == ch1)
  137.           && (base.charAt(i) == ch2)) {
  138.         state[2][i + 1] = Math.min(state[2][i + 1],
  139.             state[0][i - 1] + transpositionDistance);
  140.       }
  141.     }
  142.     return state;
  143.   }

  144.   /**
  145.    * Shift.
  146.    *
  147.    * @param state the state
  148.    */
  149.   private void _shift(double[][] state) {
  150.     double[] tmpState = state[0];
  151.     state[0] = state[1];
  152.     state[1] = state[2];
  153.     state[2] = tmpState;
  154.   }

  155.   /**
  156.    * Checks if is match.
  157.    *
  158.    * @param state the state
  159.    * @return true, if successful
  160.    */
  161.   private boolean _is_match(double[][] state) {
  162.     return state[2][state[2].length - 1] < maximum;
  163.   }

  164.   /**
  165.    * Can match.
  166.    *
  167.    * @param state the state
  168.    * @return true, if successful
  169.    */
  170.   private boolean _can_match(double[][] state) {
  171.     for (double d : state[2]) {
  172.       if (d < maximum) {
  173.         return true;
  174.       }
  175.     }
  176.     return false;
  177.   }

  178.   /**
  179.    * Distance.
  180.    *
  181.    * @param state the state
  182.    * @return the double
  183.    */
  184.   private double _distance(double[][] state) {
  185.     return state[2][state[2].length - 1];
  186.   }

  187. }