MtasTokenCollection.java

package mtas.analysis.token;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;

import mtas.analysis.util.MtasParserException;

/**
 * A collection of {@link MtasToken} objects keyed by token id.
 *
 * <p>
 * Besides the id-to-token map, the collection keeps a cached list of token
 * ids that defines the iteration order (see {@link #getCompByName()}). That
 * list is validated and rebuilt on demand whenever its size differs from the
 * size of the map; operations of this class that change the map clear it so
 * that the next access rebuilds it.
 * </p>
 */
public class MtasTokenCollection {

  /** Titles of the columns produced by {@link #getList()}. */
  private static final String[] LIST_HEADER = { "id", "start real offset",
      "end real offset", "provide real offset", "start offset", "end offset",
      "provide offset", "start position", "end position", "multiple positions",
      "parent", "provide parent", "payload", "prefix", "postfix" };

  /** The tokens, keyed by id. */
  private final Map<Integer, MtasToken> tokenCollection = new HashMap<>();

  /**
   * The token ids in iteration order; rebuilt by
   * {@link #checkTokenCollectionIndex()} when its size differs from the size
   * of {@link #tokenCollection}.
   */
  private final ArrayList<Integer> tokenCollectionIndex = new ArrayList<>();

  /**
   * Instantiates a new, empty token collection.
   */
  public MtasTokenCollection() {
    // both containers are created empty by their field initializers
  }

  /**
   * Adds a token, stored under the id reported by {@code token.getId()}. A
   * token already stored under that id is replaced.
   *
   * @param token the token to add
   * @return the id the token was stored under
   */
  public Integer add(MtasToken token) {
    Integer id = token.getId();
    tokenCollection.put(id, token);
    // A replacement keeps the size unchanged, which would otherwise leave a
    // stale sorted index in place; force a rebuild on next access.
    tokenCollectionIndex.clear();
    return id;
  }

  /**
   * Gets the token stored under the given id.
   *
   * @param id the token id
   * @return the token, or {@code null} if no token is stored under this id
   */
  public MtasToken get(Integer id) {
    return tokenCollection.get(id);
  }

  /**
   * Returns a read-only iterator over the tokens in the order defined by
   * {@link #getCompByName()}. The index is validated and rebuilt first if
   * needed.
   *
   * @return the iterator; its {@code remove()} method is not supported
   * @throws MtasParserException if the collection fails validation while the
   *           index is being rebuilt
   */
  public Iterator<MtasToken> iterator() throws MtasParserException {
    checkTokenCollectionIndex();
    return new Iterator<MtasToken>() {

      private final Iterator<Integer> indexIterator = tokenCollectionIndex
          .iterator();

      @Override
      public boolean hasNext() {
        return indexIterator.hasNext();
      }

      @Override
      public MtasToken next() {
        return tokenCollection.get(indexIterator.next());
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

  /**
   * Prints every token, in iteration order, to standard output.
   *
   * @throws MtasParserException if the collection fails validation while the
   *           index is being rebuilt
   */
  public void print() throws MtasParserException {
    Iterator<MtasToken> it = this.iterator();
    while (it.hasNext()) {
      System.out.println(it.next());
    }
  }

  /**
   * Gets the collection as a table of strings: the first row holds the column
   * titles, followed by one row per token in iteration order. Cells for
   * which the token has no information are {@code null}.
   *
   * @return the table, with {@code size + 1} rows of 15 columns
   * @throws MtasParserException if the collection fails validation while the
   *           index is being rebuilt
   */
  public String[][] getList() throws MtasParserException {
    String[][] result = new String[(tokenCollection.size() + 1)][];
    result[0] = LIST_HEADER.clone();
    int number = 1;
    Iterator<MtasToken> it = this.iterator();
    while (it.hasNext()) {
      result[number] = toRow(it.next());
      number++;
    }
    return result;
  }

  /**
   * Builds the {@link #getList()} row for a single token.
   *
   * @param token the token
   * @return the row, one cell per entry of {@link #LIST_HEADER}
   */
  private static String[] toRow(MtasToken token) {
    // cells not assigned below stay null
    String[] row = new String[LIST_HEADER.length];
    row[0] = token.getId().toString();
    if (token.getRealOffsetStart() != null) {
      row[1] = token.getRealOffsetStart().toString();
      row[2] = token.getRealOffsetEnd().toString();
      row[3] = token.getProvideRealOffset() ? "1" : null;
    }
    if (token.getOffsetStart() != null) {
      row[4] = token.getOffsetStart().toString();
      row[5] = token.getOffsetEnd().toString();
      row[6] = token.getProvideOffset() ? "1" : null;
    }
    if (token.getPositionLength() != null) {
      // A single position, no explicit position list, or a list exactly as
      // long as the start..end range is reported as start/end; anything
      // else is reported as the explicit list of positions.
      boolean reportAsRange = token.getPositionStart()
          .equals(token.getPositionEnd()) || (token.getPositions() == null)
          || (token.getPositions().length == (1 + token.getPositionEnd()
              - token.getPositionStart()));
      if (reportAsRange) {
        row[7] = token.getPositionStart().toString();
        row[8] = token.getPositionEnd().toString();
      } else {
        row[9] = Arrays.toString(token.getPositions());
      }
    }
    if (token.getParentId() != null) {
      row[10] = token.getParentId().toString();
      row[11] = token.getProvideParentId() ? "1" : null;
    }
    if (token.getPayload() != null) {
      BytesRef payload = token.getPayload();
      row[12] = Float.toString(PayloadHelper.decodeFloat(Arrays.copyOfRange(
          payload.bytes, payload.offset, (payload.offset + payload.length))));
    }
    row[13] = token.getPrefix();
    row[14] = token.getPostfix();
    return row;
  }

  /**
   * Checks the collection, optionally repairing it first. After the index
   * has been validated, the whole collection is emptied if any indexed token
   * lacks an id, a start or end position, or a value.
   *
   * @param autoRepair if {@code true}, run the auto repair before checking;
   *          {@code null} is treated as {@code false}
   * @param makeUnique if {@code true}, run the make-unique pass before
   *          checking; {@code null} is treated as {@code false}
   * @throws MtasParserException if the collection fails validation while the
   *           index is being rebuilt
   */
  public void check(Boolean autoRepair, Boolean makeUnique)
      throws MtasParserException {
    if (Boolean.TRUE.equals(autoRepair)) {
      autoRepair();
    }
    if (Boolean.TRUE.equals(makeUnique)) {
      makeUnique();
    }
    checkTokenCollectionIndex();
    for (Integer id : tokenCollectionIndex) {
      MtasToken token = tokenCollection.get(id);
      // minimal properties
      if (token.getId() == null || token.getPositionStart() == null
          || token.getPositionEnd() == null || token.getValue() == null) {
        clear();
        // the list being iterated was just cleared; leave the loop at once
        break;
      }
    }
  }

  /**
   * Make unique. Currently has no effect.
   */
  private void makeUnique() {
    // TODO: decide what has to happen with tokens sharing start position and
    // value. The previous body only gathered such tokens into temporary
    // lists that were never read, so it did not change the collection, but
    // it could fail with a NullPointerException on a token without a start
    // position before the index check could report that token properly.
  }

  /**
   * Auto repair: removes tokens without id, start or end position, value or
   * prefix, detaches tokens from parents that are not (or no longer) part of
   * the collection, and renumbers the ids from 0 upwards when the smallest
   * id is positive or the ids do not form a consecutive range.
   */
  private void autoRepair() {
    // remove tokens lacking required properties
    Iterator<Entry<Integer, MtasToken>> entries = tokenCollection.entrySet()
        .iterator();
    while (entries.hasNext()) {
      if (isIncomplete(entries.next().getValue())) {
        entries.remove();
      }
    }
    // check parentId: only tokens still in the collection can be a parent
    for (MtasToken token : tokenCollection.values()) {
      Integer parentId = token.getParentId();
      if (parentId != null && !tokenCollection.containsKey(parentId)) {
        token.setParentId(null);
      }
    }
    // always check ids
    if (idsNeedRenumbering()) {
      renumberIds();
    }
    // ids and content may have changed without changing the size
    tokenCollectionIndex.clear();
  }

  /**
   * Tells whether auto repair has to discard the token.
   *
   * @param token the token
   * @return true, if id, start position, end position, value or prefix is
   *         missing, or value or prefix is empty
   */
  private static boolean isIncomplete(MtasToken token) {
    return token.getId() == null || token.getPositionStart() == null
        || token.getPositionEnd() == null || token.getValue() == null
        || token.getValue().isEmpty() || token.getPrefix() == null
        || token.getPrefix().isEmpty();
  }

  /**
   * Tells whether the ids used as keys are unacceptable: the smallest id is
   * positive, or the ids do not form a consecutive range.
   *
   * @return true, if the collection is not empty and its ids are
   *         unacceptable
   */
  private boolean idsNeedRenumbering() {
    if (tokenCollection.isEmpty()) {
      return false;
    }
    int minId = Collections.min(tokenCollection.keySet());
    int maxId = Collections.max(tokenCollection.keySet());
    return (minId > 0) || ((1 + maxId - minId) != tokenCollection.size());
  }

  /**
   * Assigns new ids 0, 1, 2, ... to all tokens, in the iteration order of
   * the underlying map, and updates ids, parent ids and map keys
   * accordingly.
   */
  private void renumberIds() {
    // create translation from old to new id
    HashMap<Integer, Integer> translation = new HashMap<>();
    int newId = 0;
    for (Integer oldId : tokenCollection.keySet()) {
      translation.put(oldId, newId);
      newId++;
    }
    // translate objects and collect them under their new id
    HashMap<Integer, MtasToken> renumbered = new HashMap<>();
    for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
      MtasToken token = entry.getValue();
      Integer parentId = token.getParentId();
      Integer translatedId = translation.get(entry.getKey());
      token.setId(translatedId);
      if (parentId != null) {
        token.setParentId(translation.get(parentId));
      }
      renumbered.put(translatedId, token);
    }
    // new tokenCollection
    tokenCollection.clear();
    tokenCollection.putAll(renumbered);
  }

  /**
   * Check token collection index: when the size of the index differs from
   * the size of the collection, validates every token and the range of ids,
   * and rebuilds the index sorted with {@link #getCompByName()}. If
   * validation fails the index is left empty, so the next call validates
   * again instead of using a half-built index.
   *
   * @throws MtasParserException if a token has no id, position, value,
   *           prefix or offset, refers to a parent that is not in the
   *           collection, or if the ids have a positive minimum or are not
   *           consecutive
   */
  private void checkTokenCollectionIndex() throws MtasParserException {
    if (tokenCollectionIndex.size() == tokenCollection.size()) {
      return;
    }
    tokenCollectionIndex.clear();
    // build in a local list; only publish once everything succeeded
    ArrayList<Integer> ids = new ArrayList<>(tokenCollection.size());
    for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
      validateToken(entry.getKey(), entry.getValue());
      ids.add(entry.getKey());
    }
    if (idsNeedRenumbering()) {
      throw new MtasParserException("missing ids");
    }
    Collections.sort(ids, getCompByName());
    tokenCollectionIndex.addAll(ids);
  }

  /**
   * Validates a single entry of the collection for
   * {@link #checkTokenCollectionIndex()}.
   *
   * @param key the key the token is stored under
   * @param token the token
   * @throws MtasParserException if the first failing check, in the order id,
   *           position, value, prefix, parent, offset, is not satisfied
   */
  private void validateToken(Integer key, MtasToken token)
      throws MtasParserException {
    // the key is checked too, so a null key is reported here instead of
    // failing later when the range of ids is computed
    if (key == null || token.getId() == null) {
      throw new MtasParserException(
          "no id for token (" + token.getValue() + ")");
    } else if ((token.getPositionStart() == null)
        || (token.getPositionEnd() == null)) {
      throw new MtasParserException("no position for token with id "
          + token.getId() + " (" + token.getValue() + ")");
    } else if (token.getValue() == null || (token.getValue().equals(""))) {
      throw new MtasParserException(
          "no value for token with id " + token.getId());
    } else if (token.getPrefix() == null || (token.getPrefix().equals(""))) {
      throw new MtasParserException(
          "no prefix for token with id " + token.getId());
    } else if ((token.getParentId() != null)
        && !tokenCollection.containsKey(token.getParentId())) {
      throw new MtasParserException(
          "missing parentId for token with id " + token.getId());
    } else if ((token.getOffsetStart() == null)
        || (token.getOffsetEnd() == null)) {
      throw new MtasParserException("missing offset for token with id "
          + token.getId() + " (" + token.getValue() + ")");
    }
  }

  /**
   * Gets a comparator on token ids of this collection. Tokens are ordered by
   * start position; tokens with the same start position are ordered by start
   * offset when both have one and these differ, and by value otherwise.
   *
   * @return the comparator; it looks up both ids in this collection on every
   *         comparison
   */
  public Comparator<Integer> getCompByName() {
    return new Comparator<Integer>() {
      @Override
      public int compare(Integer t1, Integer t2) {
        MtasToken token1 = tokenCollection.get(t1);
        MtasToken token2 = tokenCollection.get(t2);
        Integer p1 = token1.getPositionStart();
        Integer p2 = token2.getPositionStart();
        assert p1 != null : "no position for " + token1;
        assert p2 != null : "no position for " + token2;
        if (!p1.equals(p2)) {
          return p1.compareTo(p2);
        }
        Integer o1 = token1.getOffsetStart();
        Integer o2 = token2.getOffsetStart();
        if (o1 != null && o2 != null && !o1.equals(o2)) {
          return o1.compareTo(o2);
        }
        return token1.getValue().compareTo(token2.getValue());
      }
    };
  }

  /**
   * Removes all tokens and empties the index.
   */
  private void clear() {
    tokenCollectionIndex.clear();
    tokenCollection.clear();
  }

}