// MtasToken.java

package mtas.analysis.token;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;

/**
 * Abstract base for a token: an id, a value of the form
 * {@code prefix + DELIMITER + postfix}, one or more positions, optional
 * offsets and real offsets, an optional payload and an optional parent id.
 *
 * <p>
 * Besides the per-token state, the class offers static helpers to split a
 * value into prefix and postfix and to build Lucene automata that match such
 * values.
 * </p>
 */
public abstract class MtasToken {

  /** The Constant log. */
  private static final Log log = LogFactory.getLog(MtasToken.class);

  /** Separator (U+0001) between the prefix and the postfix of a value. */
  public static final String DELIMITER = "\u0001";

  /**
   * Regular expression splitting a value into prefix (group 1) and postfix
   * (group 2).
   */
  public static final String regexpPrePostFix = "(.*)" + DELIMITER
      + "(.[^\u0000]*)";

  /** Compiled form of {@link #regexpPrePostFix}. */
  public static final Pattern patternPrePostFix = Pattern
      .compile(regexpPrePostFix);

  /**
   * Characters that are escaped with a backslash by
   * {@link #createAutomatonMap(String, List, Boolean)} when filtering is
   * requested; compiled once instead of once per item.
   */
  private static final Pattern FILTER_ESCAPE_PATTERN = Pattern
      .compile("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])");

  /** The token id. */
  private Integer tokenId;

  /** The token ref. */
  private Long tokenRef = null;

  /** The term ref. */
  private Long termRef = null;

  /** The prefix id. */
  private Integer prefixId = null;

  /** The token type. */
  protected String tokenType = null;

  /** The token parent id. */
  private Integer tokenParentId = null;

  /** The token value. */
  private String tokenValue = null;

  /** The token position. */
  private MtasPosition tokenPosition = null;

  /** The token offset. */
  private MtasOffset tokenOffset = null;

  /** The token real offset. */
  private MtasOffset tokenRealOffset = null;

  /** The token payload. */
  private BytesRef tokenPayload = null;

  /** Whether the offset should be provided. */
  private Boolean provideOffset = true;

  /** Whether the real offset should be provided. */
  private Boolean provideRealOffset = true;

  /** Whether the parent id should be provided. */
  private Boolean provideParentId = true;

  /**
   * Instantiates a new mtas token from a complete value.
   *
   * @param tokenId the token id
   * @param value the value, stored as is
   */
  protected MtasToken(Integer tokenId, String value) {
    this.tokenId = tokenId;
    setType();
    setValue(value);
  }

  /**
   * Instantiates a new mtas token from a prefix and postfix. The value becomes
   * {@code prefix + DELIMITER + postfix}, or {@code prefix + DELIMITER} when
   * no postfix is given.
   *
   * @param tokenId the token id
   * @param prefix the prefix, must not be null
   * @param postfix the postfix, may be null
   * @throws NullPointerException if prefix is null
   */
  protected MtasToken(Integer tokenId, String prefix, String postfix) {
    Objects.requireNonNull(prefix, "prefix is obligatory");
    this.tokenId = tokenId;
    setType();
    if (postfix != null) {
      setValue(prefix + DELIMITER + postfix);
    } else {
      setValue(prefix + DELIMITER);
    }
  }

  /**
   * Instantiates a new mtas token from a complete value and a position.
   *
   * @param tokenId the token id
   * @param value the value
   * @param position the position, must not be null
   * @throws NullPointerException if position is null
   */
  protected MtasToken(Integer tokenId, String value, Integer position) {
    this(tokenId, value);
    // fail with a clear message instead of an anonymous unboxing failure
    Objects.requireNonNull(position, "position is obligatory");
    addPosition(position);
  }

  /**
   * Instantiates a new mtas token from a prefix, postfix and a position.
   *
   * @param tokenId the token id
   * @param prefix the prefix, must not be null
   * @param postfix the postfix, may be null
   * @param position the position, must not be null
   * @throws NullPointerException if prefix or position is null
   */
  protected MtasToken(Integer tokenId, String prefix, String postfix,
      Integer position) {
    this(tokenId, prefix, postfix);
    // fail with a clear message instead of an anonymous unboxing failure
    Objects.requireNonNull(position, "position is obligatory");
    addPosition(position);
  }

  /**
   * Sets the token ref.
   *
   * @param ref the new token ref
   */
  public final void setTokenRef(Long ref) {
    tokenRef = ref;
  }

  /**
   * Gets the token ref.
   *
   * @return the token ref, or null if not set
   */
  public final Long getTokenRef() {
    return tokenRef;
  }

  /**
   * Sets the term ref.
   *
   * @param ref the new term ref
   */
  public final void setTermRef(Long ref) {
    termRef = ref;
  }

  /**
   * Gets the term ref.
   *
   * @return the term ref, or null if not set
   */
  public final Long getTermRef() {
    return termRef;
  }

  /**
   * Sets the prefix id.
   *
   * @param id the new prefix id
   */
  public final void setPrefixId(int id) {
    prefixId = id;
  }

  /**
   * Gets the prefix id.
   *
   * @return the prefix id
   * @throws IOException if no prefix id has been set
   */
  public final int getPrefixId() throws IOException {
    if (prefixId == null) {
      throw new IOException("no prefixId");
    }
    return prefixId;
  }

  /**
   * Sets the id.
   *
   * @param id the new id
   */
  public final void setId(Integer id) {
    tokenId = id;
  }

  /**
   * Gets the id.
   *
   * @return the id
   */
  public final Integer getId() {
    return tokenId;
  }

  /**
   * Sets the parent id.
   *
   * @param id the new parent id
   */
  public final void setParentId(Integer id) {
    tokenParentId = id;
  }

  /**
   * Gets the parent id.
   *
   * @return the parent id, or null if not set
   */
  public final Integer getParentId() {
    return tokenParentId;
  }

  /**
   * Sets whether the parent id should be provided.
   *
   * @param provide the new provide parent id, should not be null since the
   *        flag is unboxed when read
   */
  public final void setProvideParentId(Boolean provide) {
    provideParentId = provide;
  }

  /**
   * Gets whether the parent id should be provided.
   *
   * @return the provide parent id
   */
  public final boolean getProvideParentId() {
    return provideParentId;
  }

  /**
   * Sets the type. This base implementation always throws; it is called from
   * the constructors, so subclasses are expected to override it (presumably
   * to assign {@link #tokenType}).
   *
   * @throws IllegalArgumentException always, in this base implementation
   */
  protected void setType() {
    throw new IllegalArgumentException("Type not implemented");
  }

  /**
   * Gets the type.
   *
   * @return the type
   */
  public final String getType() {
    return tokenType;
  }

  /**
   * Adds a single position.
   *
   * @param position the position
   */
  public final void addPosition(int position) {
    if (tokenPosition == null) {
      tokenPosition = new MtasPosition(position);
    } else {
      tokenPosition.add(position);
    }
  }

  /**
   * Adds the position range from start to end, both inclusive.
   *
   * @param start the start
   * @param end the end
   */
  public final void addPositionRange(int start, int end) {
    if (tokenPosition == null) {
      tokenPosition = new MtasPosition(start, end);
    } else {
      // expand the range to the individual positions before merging
      int[] positions = new int[end - start + 1];
      for (int i = start; i <= end; i++) {
        positions[i - start] = i;
      }
      tokenPosition.add(positions);
    }
  }

  /**
   * Adds the positions; a null or empty array is ignored.
   *
   * @param positions the positions
   */
  public final void addPositions(int[] positions) {
    if (positions == null || positions.length == 0) {
      return;
    }
    if (tokenPosition == null) {
      tokenPosition = new MtasPosition(positions);
    } else {
      tokenPosition.add(positions);
    }
  }

  /**
   * Adds the positions; a null or empty set is ignored, consistent with
   * {@link #addPositions(int[])}.
   *
   * @param list the set of positions
   */
  public final void addPositions(Set<Integer> list) {
    if (list == null) {
      return;
    }
    int[] positions = ArrayUtils
        .toPrimitive(list.toArray(new Integer[list.size()]));
    addPositions(positions);
  }

  /**
   * Checks the type of the position.
   *
   * @param type the type
   * @return false if no position is set, otherwise the result of the type
   *         check on the position
   */
  public final Boolean checkPositionType(String type) {
    if (tokenPosition == null) {
      return false;
    } else {
      return tokenPosition.checkType(type);
    }
  }

  /**
   * Gets the position start.
   *
   * @return the position start, or null if no position is set
   */
  public final Integer getPositionStart() {
    return tokenPosition == null ? null : tokenPosition.getStart();
  }

  /**
   * Gets the position end.
   *
   * @return the position end, or null if no position is set
   */
  public final Integer getPositionEnd() {
    return tokenPosition == null ? null : tokenPosition.getEnd();
  }

  /**
   * Gets the position length.
   *
   * @return the position length, or null if no position is set
   */
  public final Integer getPositionLength() {
    return tokenPosition == null ? null : tokenPosition.getLength();
  }

  /**
   * Gets the positions.
   *
   * @return the positions, or null if no position is set
   */
  public final int[] getPositions() {
    return tokenPosition == null ? null : tokenPosition.getPositions();
  }

  /**
   * Checks whether an offset is available and should be provided.
   *
   * @return true if an offset is set and providing it is enabled
   */
  public final Boolean checkOffset() {
    return tokenOffset != null && provideOffset;
  }

  /**
   * Checks whether a real offset is available, should be provided, and
   * differs from the regular offset.
   *
   * @return false if no real offset is set, providing it is disabled, or it
   *         equals the regular offset; true otherwise
   */
  public final Boolean checkRealOffset() {
    if (tokenRealOffset == null || !provideRealOffset) {
      return false;
    }
    if (tokenOffset == null) {
      return true;
    }
    // Objects.equals compares by value whether the MtasOffset getters return
    // primitives or boxed Integers; == would compare identity for the latter
    boolean sameStart = Objects.equals(tokenOffset.getStart(),
        tokenRealOffset.getStart());
    boolean sameEnd = Objects.equals(tokenOffset.getEnd(),
        tokenRealOffset.getEnd());
    return !(sameStart && sameEnd);
  }

  /**
   * Sets the offset; the call is ignored when start or end is null.
   *
   * @param start the start
   * @param end the end
   * @throws IllegalArgumentException if start is after end
   */
  public final void setOffset(Integer start, Integer end) {
    if (start == null || end == null) {
      return;
    }
    if (start > end) {
      throw new IllegalArgumentException("Start offset after end offset");
    }
    tokenOffset = new MtasOffset(start, end);
  }

  /**
   * Adds an offset to the current offset, or sets it when no offset is
   * available yet; the call is ignored when start or end is null.
   *
   * @param start the start
   * @param end the end
   * @throws IllegalArgumentException if start is after end
   */
  public final void addOffset(Integer start, Integer end) {
    if (tokenOffset == null) {
      setOffset(start, end);
      return;
    }
    if (start == null || end == null) {
      return;
    }
    if (start > end) {
      throw new IllegalArgumentException("Start offset after end offset");
    }
    tokenOffset.add(start, end);
  }

  /**
   * Sets whether the offset should be provided.
   *
   * @param provide the new provide offset, should not be null since the flag
   *        is unboxed when read
   */
  public final void setProvideOffset(Boolean provide) {
    provideOffset = provide;
  }

  /**
   * Sets the real offset; the call is ignored when start or end is null.
   *
   * @param start the start
   * @param end the end
   * @throws IllegalArgumentException if start is after end
   */
  public final void setRealOffset(Integer start, Integer end) {
    if (start == null || end == null) {
      return;
    }
    if (start > end) {
      throw new IllegalArgumentException(
          "Start real offset after end real offset");
    }
    tokenRealOffset = new MtasOffset(start, end);
  }

  /**
   * Sets whether the real offset should be provided.
   *
   * @param provide the new provide real offset, should not be null since the
   *        flag is unboxed when read
   */
  public final void setProvideRealOffset(Boolean provide) {
    provideRealOffset = provide;
  }

  /**
   * Gets whether the offset should be provided.
   *
   * @return the provide offset
   */
  public final boolean getProvideOffset() {
    return provideOffset;
  }

  /**
   * Gets whether the real offset should be provided.
   *
   * @return the provide real offset
   */
  public final boolean getProvideRealOffset() {
    return provideRealOffset;
  }

  /**
   * Gets the offset start.
   *
   * @return the offset start, or null if no offset is set
   */
  public final Integer getOffsetStart() {
    return tokenOffset == null ? null : tokenOffset.getStart();
  }

  /**
   * Gets the offset end.
   *
   * @return the offset end, or null if no offset is set
   */
  public final Integer getOffsetEnd() {
    return tokenOffset == null ? null : tokenOffset.getEnd();
  }

  /**
   * Gets the real offset start.
   *
   * @return the real offset start, or null if no real offset is set
   */
  public final Integer getRealOffsetStart() {
    return tokenRealOffset == null ? null : tokenRealOffset.getStart();
  }

  /**
   * Gets the real offset end.
   *
   * @return the real offset end, or null if no real offset is set
   */
  public final Integer getRealOffsetEnd() {
    return tokenRealOffset == null ? null : tokenRealOffset.getEnd();
  }

  /**
   * Sets the value.
   *
   * @param value the new value
   */
  public void setValue(String value) {
    tokenValue = value;
  }

  /**
   * Gets the prefix from a value: the part before the first
   * {@link #DELIMITER} (or the whole value if it has no delimiter), with all
   * NUL (U+0000) characters removed.
   *
   * @param value the value
   * @return the prefix; null if value is null or consists of delimiters only
   */
  public static String getPrefixFromValue(String value) {
    if (value == null) {
      return null;
    }
    if (!value.contains(DELIMITER)) {
      return value.replace("\u0000", "");
    }
    // split drops trailing empty parts, so a value made of delimiters only
    // yields an empty array
    String[] list = value.split(DELIMITER);
    return list.length > 0 ? list[0].replace("\u0000", "") : null;
  }

  /**
   * Gets the postfix from a value, being group 2 of
   * {@link #patternPrePostFix}.
   *
   * @param value the value
   * @return the postfix; the empty string if the pattern does not match
   *         (e.g. no delimiter, or nothing after it); null if value is null
   */
  public static String getPostfixFromValue(String value) {
    if (value == null) {
      // mirrors getPrefixFromValue and lets toString() print a placeholder
      return null;
    }
    Matcher m = patternPrePostFix.matcher(value);
    return m.find() ? m.group(2) : "";
  }

  /**
   * Gets the postfix from a UTF-8 encoded term: the bytes after the first
   * delimiter byte (0x01) up to, but not including, the first NUL byte or the
   * end of the term.
   *
   * @param term the term
   * @return the decoded postfix; the empty string if the term has no
   *         delimiter or contains an invalid or truncated byte sequence; null
   *         if term is null
   */
  public static String getPostfixFromValue(BytesRef term) {
    if (term == null) {
      return null;
    }
    final byte[] bytes = term.bytes;
    final int end = term.offset + term.length;
    int i = term.offset;
    // skip the prefix, up to and including the first delimiter
    while (i < end) {
      byte lead = bytes[i];
      int size = utf8SequenceLength(lead);
      if (size < 0) {
        return "";
      }
      i += size;
      if (lead == 0b00000001) {
        break;
      }
    }
    if (i > end) {
      // last sequence of the prefix was truncated
      return "";
    }
    final int start = i;
    // walk the postfix until a NUL byte or the end of the term
    while (i < end) {
      byte lead = bytes[i];
      if (lead == 0b00000000) {
        break;
      }
      int size = utf8SequenceLength(lead);
      if (size < 0 || i + size > end) {
        return "";
      }
      i += size;
    }
    return new String(bytes, start, i - start, StandardCharsets.UTF_8);
  }

  /**
   * Determines the length of an encoded sequence from its lead byte,
   * accepting lead bytes announcing one up to six bytes.
   *
   * @param lead the first byte of the sequence
   * @return the number of bytes in the sequence, or -1 if the byte is not a
   *         valid lead byte
   */
  private static int utf8SequenceLength(byte lead) {
    if ((lead & 0b10000000) == 0b00000000) {
      return 1;
    } else if ((lead & 0b11100000) == 0b11000000) {
      return 2;
    } else if ((lead & 0b11110000) == 0b11100000) {
      return 3;
    } else if ((lead & 0b11111000) == 0b11110000) {
      return 4;
    } else if ((lead & 0b11111100) == 0b11111000) {
      return 5;
    } else if ((lead & 0b11111110) == 0b11111100) {
      return 6;
    } else {
      return -1;
    }
  }

  /**
   * Gets the value.
   *
   * @return the value
   */
  public String getValue() {
    return tokenValue;
  }

  /**
   * Gets the prefix of the value of this token.
   *
   * @return the prefix, see {@link #getPrefixFromValue(String)}
   */
  public String getPrefix() {
    return getPrefixFromValue(tokenValue);
  }

  /**
   * Gets the postfix of the value of this token.
   *
   * @return the postfix, see {@link #getPostfixFromValue(String)}
   */
  public String getPostfix() {
    return getPostfixFromValue(tokenValue);
  }

  /**
   * Checks whether a parent id is available and should be provided.
   *
   * @return true if a parent id is set and providing it is enabled
   */
  public final Boolean checkParentId() {
    return tokenParentId != null && provideParentId;
  }

  /**
   * Checks whether a payload is available.
   *
   * @return true if a payload is set
   */
  public final Boolean checkPayload() {
    return tokenPayload != null;
  }

  /**
   * Sets the payload.
   *
   * @param payload the new payload
   */
  public void setPayload(BytesRef payload) {
    tokenPayload = payload;
  }

  /**
   * Gets the payload.
   *
   * @return the payload, or null if not set
   */
  public BytesRef getPayload() {
    return tokenPayload;
  }

  /**
   * Creates a map with, for each item in the value list, an automaton for the
   * regular expression {@code prefix + DELIMITER + item} followed by any
   * number of NUL characters. With filtering enabled, a fixed set of special
   * characters in the item is escaped first; the escaped item is then also
   * used as key in the map.
   *
   * @param prefix the prefix
   * @param valueList the value list, may be null
   * @param filter whether to escape special characters in the items; null is
   *        treated as false
   * @return the map from (possibly escaped) item to automaton, empty if the
   *         value list is null
   */
  public static Map<String, Automaton> createAutomatonMap(String prefix,
      List<String> valueList, Boolean filter) {
    HashMap<String, Automaton> automatonMap = new HashMap<>();
    if (valueList == null) {
      return automatonMap;
    }
    // avoid unboxing a null Boolean
    final boolean escape = Boolean.TRUE.equals(filter);
    for (String item : valueList) {
      String key = escape
          ? FILTER_ESCAPE_PATTERN.matcher(item).replaceAll("\\\\$1") : item;
      automatonMap.put(key,
          new RegExp(prefix + MtasToken.DELIMITER + key + "\u0000*")
              .toAutomaton());
    }
    return automatonMap;
  }

  /**
   * Creates a map with a byte run automaton for each automaton in the
   * provided map, using the same keys.
   *
   * @param automatonMap the automaton map, may be null
   * @return the map of byte run automata, empty if the provided map is null
   */
  public static Map<String, ByteRunAutomaton> byteRunAutomatonMap(
      Map<String, Automaton> automatonMap) {
    HashMap<String, ByteRunAutomaton> byteRunAutomatonMap = new HashMap<>();
    if (automatonMap != null) {
      for (Entry<String, Automaton> entry : automatonMap.entrySet()) {
        byteRunAutomatonMap.put(entry.getKey(),
            new ByteRunAutomaton(entry.getValue()));
      }
    }
    return byteRunAutomatonMap;
  }

  /**
   * Creates compiled automata for the automata in the provided map. The
   * automata are combined by union in chunks of at most 500 and, when a
   * regular expression is given, intersected with the automaton for
   * {@code prefix + DELIMITER + regexp} followed by any number of NUL
   * characters. When a chunk is too complex to determinize, the chunk size is
   * halved and the chunk is retried; the next chunk starts directly after the
   * entries that were actually compiled, so every entry ends up in exactly
   * one compiled automaton.
   *
   * @param prefix the prefix
   * @param regexp the regexp, may be null
   * @param automatonMap the automaton map, may be null
   * @return the list of compiled automata, empty if the map is null or empty
   * @throws IOException if even a single automaton is too complex to
   *         determinize
   */
  public static List<CompiledAutomaton> createAutomata(String prefix,
      String regexp, Map<String, Automaton> automatonMap) throws IOException {
    List<CompiledAutomaton> list = new ArrayList<>();
    Automaton automatonRegexp = null;
    if (regexp != null) {
      RegExp re = new RegExp(prefix + MtasToken.DELIMITER + regexp + "\u0000*");
      automatonRegexp = re.toAutomaton();
    }
    if (automatonMap == null) {
      // consistent with byteRunAutomatonMap: nothing to compile
      return list;
    }
    final int step = 500;
    List<String> keyList = new ArrayList<>(automatonMap.keySet());
    int i = 0;
    while (i < keyList.size()) {
      // never start with a chunk larger than what is left, so that halving
      // always results in a smaller chunk
      int localStep = Math.min(step, keyList.size() - i);
      int next = i;
      CompiledAutomaton compiledAutomaton = null;
      while (compiledAutomaton == null) {
        next = i + localStep;
        List<Automaton> listAutomaton = new ArrayList<>(localStep);
        for (String key : keyList.subList(i, next)) {
          listAutomaton.add(automatonMap.get(key));
        }
        Automaton automaton = Operations.union(listAutomaton);
        if (automatonRegexp != null) {
          automaton = Operations.intersection(automaton, automatonRegexp);
        }
        try {
          compiledAutomaton = new CompiledAutomaton(automaton);
        } catch (TooComplexToDeterminizeException e) {
          log.debug(e);
          if (localStep > 1) {
            localStep /= 2;
          } else {
            throw new IOException("TooComplexToDeterminizeException", e);
          }
        }
      }
      list.add(compiledAutomaton);
      // continue directly after the entries that were compiled; advancing by
      // the full step would skip entries whenever the chunk had been reduced
      i = next;
    }
    return list;
  }

  /**
   * Creates a fixed-width, single-line description with id, real offset,
   * offset, positions, parent id, payload, prefix and postfix. An asterisk
   * after the real offset, offset or parent id indicates that providing it is
   * disabled.
   *
   * @return the description
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("[").append(String.format("%05d", getId())).append("] ");
    // real offset
    if (getRealOffsetStart() == null) {
      text.append("[-------,-------]");
    } else {
      text.append("[").append(String.format("%07d", getRealOffsetStart()))
          .append("-").append(String.format("%07d", getRealOffsetEnd()))
          .append("]");
    }
    text.append(provideRealOffset ? "  " : "* ");
    // offset
    if (getOffsetStart() == null) {
      text.append("[-------,-------]");
    } else {
      text.append("[").append(String.format("%07d", getOffsetStart()))
          .append("-").append(String.format("%07d", getOffsetEnd()))
          .append("]");
    }
    text.append(provideOffset ? "  " : "* ");
    // positions: single, contiguous range, or explicit list
    if (getPositionLength() == null) {
      text.append(String.format("%11s", ""));
    } else if (getPositionStart().equals(getPositionEnd())) {
      text.append(String.format("%11s", "[" + getPositionStart() + "]"));
    } else {
      int[] positions = getPositions();
      if ((positions == null) || (positions.length == (1 + getPositionEnd()
          - getPositionStart()))) {
        text.append(String.format("%11s",
            "[" + getPositionStart() + "-" + getPositionEnd() + "]"));
      } else {
        text.append(String.format("%11s", Arrays.toString(positions)));
      }
    }
    // parent id
    if (getParentId() == null) {
      text.append("[-----]");
    } else {
      text.append("[").append(String.format("%05d", getParentId()))
          .append("]");
    }
    text.append(provideParentId ? "  " : "* ");
    // payload, decoded as float
    BytesRef payload = getPayload();
    if (payload == null) {
      text.append("[------] ");
    } else {
      text.append("[")
          .append(String.format("%.4f",
              PayloadHelper.decodeFloat(Arrays.copyOfRange(payload.bytes,
                  payload.offset, (payload.offset + payload.length)))))
          .append("] ");
    }
    // prefix and postfix
    text.append(String.format("%25s", "[" + getPrefix() + "]")).append(" ");
    String postfix = getPostfix();
    text.append((postfix == null) ? "---" : "[" + postfix + "]").append(" ");
    return text.toString();
  }

}