MtasTokenizer.java

package mtas.analysis;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Iterator;
import mtas.analysis.parser.MtasParser;
import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasConfiguration;
import mtas.analysis.util.MtasParserException;
import mtas.codec.payload.MtasPayloadEncoder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;

/**
 * Lucene {@link Tokenizer} that hands the complete input to a configurable
 * {@link MtasParser} and then emits the resulting {@link MtasToken}s one by
 * one, filling the term, offset, position increment and payload attributes.
 *
 * <p>
 * The parser class and the payload encoding flags are taken from an
 * {@link MtasConfiguration}: a child named {@value #CONFIGURATION_MTAS_PARSER}
 * selects the parser class through its
 * {@value #CONFIGURATION_MTAS_PARSER_ATTRIBUTE} attribute, and a child named
 * {@value #CONFIGURATION_MTAS_INDEX} switches individual encoding flags on or
 * off.
 * </p>
 */

public final class MtasTokenizer extends Tokenizer {

  /** The Constant log. */
  private static final Log log = LogFactory.getLog(MtasTokenizer.class);

  /** The Constant CONFIGURATION_MTAS. */
  public static final String CONFIGURATION_MTAS = "mtas";

  /** Name of the configuration child holding the index (encoding) settings. */
  public static final String CONFIGURATION_MTAS_INDEX = "index";

  /** Attribute on an index setting that carries its on/off value. */
  public static final String CONFIGURATION_MTAS_INDEX_ATTRIBUTE = "index";

  /** Name of the configuration child holding the parser settings. */
  public static final String CONFIGURATION_MTAS_PARSER = "parser";

  /** Attribute on the parser child that carries the parser class name. */
  public static final String CONFIGURATION_MTAS_PARSER_ATTRIBUTE = "name";

  /** Attribute value that enables an encoding flag. */
  private static final String VALUE_TRUE = "true";

  /** Attribute value that disables an encoding flag. */
  private static final String VALUE_FALSE = "false";

  /** Alternative attribute value that disables an encoding flag. */
  private static final String VALUE_0 = "0";

  /** Alternative attribute value that enables an encoding flag. */
  private static final String VALUE_1 = "1";

  /** The current position. */
  private int currentPosition = 0;

  /** The encoding flags. */
  private int encodingFlags = MtasPayloadEncoder.ENCODE_DEFAULT;

  /** The parser name. */
  private String parserName = null;

  /** The parser configuration. */
  private MtasConfiguration parserConfiguration = null;

  /** The token collection. */
  private MtasTokenCollection tokenCollection;

  /** The term att. */
  private final CharTermAttribute termAtt = addAttribute(
      CharTermAttribute.class);

  /** The offset att. */
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /** The payload att. */
  private final PayloadAttribute payloadAtt = addAttribute(
      PayloadAttribute.class);

  /** The position increment att. */
  private final PositionIncrementAttribute positionIncrementAtt = addAttribute(
      PositionIncrementAttribute.class);

  /** The token collection iterator. */
  private Iterator<MtasToken> tokenCollectionIterator;

  /**
   * Instantiates a new mtas tokenizer without any configuration. No parser is
   * set, so {@link #reset()} fails with an {@link IOException} until the
   * tokenizer is created through one of the configuring constructors.
   */
  public MtasTokenizer() {
  }

  /**
   * Instantiates a new mtas tokenizer from a configuration file.
   *
   * <p>
   * Problems while locating or reading the file are logged and not
   * propagated; in that case the tokenizer is left without a parser.
   * </p>
   *
   * @param configFileName the config file name
   */
  public MtasTokenizer(final String configFileName) {
    readConfigurationFile(configFileName);
  }

  /**
   * Instantiates a new mtas tokenizer from an already parsed configuration.
   *
   * @param config the config
   * @throws IOException if the configuration is {@code null} or contains a
   *           parser element without a name attribute
   */
  public MtasTokenizer(final MtasConfiguration config) throws IOException {
    processConfiguration(config);
  }

  /**
   * Instantiates a new mtas tokenizer from a stream containing the
   * configuration. The stream is not closed by this constructor.
   *
   * @param reader the stream to read the configuration from
   * @throws IOException if the configuration cannot be read or is invalid
   */
  public MtasTokenizer(final InputStream reader) throws IOException {
    processConfiguration(MtasConfiguration.readConfiguration(reader));
  }

  /**
   * Instantiates a new mtas tokenizer with a custom attribute factory.
   *
   * @param factory the factory
   * @param config the config
   * @throws IOException if the configuration is {@code null} or contains a
   *           parser element without a name attribute
   */
  public MtasTokenizer(final AttributeFactory factory,
      final MtasConfiguration config) throws IOException {
    super(factory);
    processConfiguration(config);
  }

  /**
   * Advances to the next token of the collection built by {@link #reset()}.
   *
   * @return {@code true} if a token was emitted, {@code false} if there is no
   *         collection or it is exhausted
   * @throws IOException Signals that an I/O exception has occurred.
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    if (tokenCollectionIterator == null
        || !tokenCollectionIterator.hasNext()) {
      return false;
    }
    MtasToken token = tokenCollectionIterator.next();
    // the increment is relative to the start position of the previous token
    int positionStart = token.getPositionStart();
    int positionIncrement = positionStart - currentPosition;
    currentPosition = positionStart;
    MtasPayloadEncoder payloadEncoder = new MtasPayloadEncoder(token,
        encodingFlags);
    termAtt.append(token.getValue());
    positionIncrementAtt.setPositionIncrement(positionIncrement);
    offsetAtt.setOffset(token.getOffsetStart(), token.getOffsetEnd());
    payloadAtt.setPayload(payloadEncoder.getPayload());
    return true;
  }

  /**
   * Parses the current input into a fresh token collection and positions the
   * iterator at its start.
   *
   * @throws IOException if the parser cannot be created or fails to parse the
   *           input; the original exception is attached as cause
   * @see org.apache.lucene.analysis.Tokenizer#reset()
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    currentPosition = -1;
    // never keep iterating a previous document when this reset fails
    tokenCollectionIterator = null;
    try {
      constructTokenCollection(input);
      if (tokenCollection != null) {
        tokenCollectionIterator = tokenCollection.iterator();
      }
    } catch (MtasConfigException | MtasParserException e) {
      throw new IOException(e);
    }
  }

  /**
   * Parses the content of the reader and prints the resulting token
   * collection. The tokenizer is always closed afterwards, also on failure,
   * so it can be reused with another reader.
   *
   * @param r the reader providing the content to parse
   * @throws MtasParserException if parsing or printing fails
   */
  public void print(final Reader r) throws MtasParserException {
    try {
      setReader(r);
      try {
        reset();
        if (tokenCollection != null) {
          tokenCollection.print();
        }
        end();
      } finally {
        close();
      }
    } catch (IOException e) {
      log.error(e.getMessage(), e);
      throw new MtasParserException(e.getClass() + " : " + e.getMessage());
    }
  }

  /**
   * Parses the content of the reader and returns the list representation of
   * the resulting token collection. The tokenizer is always closed
   * afterwards, also on failure, so it can be reused with another reader.
   *
   * @param r the reader providing the content to parse
   * @return the list as produced by the token collection
   * @throws IOException if parsing fails or no list can be produced
   */
  public String[][] getList(final Reader r) throws IOException {
    setReader(r);
    try {
      reset();
      if (tokenCollection == null) {
        throw new IOException("can't produce list");
      }
      String[][] result = tokenCollection.getList();
      end();
      return result;
    } catch (MtasParserException e) {
      log.info("can't produce list", e);
      throw new IOException("can't produce list", e);
    } finally {
      close();
    }
  }

  /**
   * Instantiates the configured parser by reflection, using its constructor
   * taking a {@link MtasConfiguration}, and lets it build the token
   * collection from the reader.
   *
   * @param reader the reader
   * @throws MtasConfigException if no parser is configured, the parser class
   *           cannot be instantiated or is not a {@link MtasParser}
   * @throws MtasParserException if the parser fails; the token collection is
   *           replaced by an empty one in that case
   */
  private void constructTokenCollection(final Reader reader)
      throws MtasConfigException, MtasParserException {
    tokenCollection = null;
    if (parserName == null) {
      // Class.forName(null) would otherwise surface as a NullPointerException
      throw new MtasConfigException("no parser configured");
    }
    try {
      Constructor<?> c = Class.forName(parserName)
          .getDeclaredConstructor(MtasConfiguration.class);
      Object p = c.newInstance(parserConfiguration);
      if (!(p instanceof MtasParser)) {
        throw new MtasConfigException("no instance of MtasParser");
      }
      tokenCollection = ((MtasParser) p).createTokenCollection(reader);
    } catch (MtasParserException e) {
      log.debug(e.getMessage(), e);
      tokenCollection = new MtasTokenCollection();
      throw e;
    } catch (NoSuchMethodException | InvocationTargetException
        | IllegalAccessException | ClassNotFoundException
        | InstantiationException e) {
      log.debug(e.getMessage(), e);
      // an InvocationTargetException only wraps the failure of the parser
      // constructor and has no message of its own
      Throwable reason = e;
      if (e instanceof InvocationTargetException && e.getCause() != null) {
        reason = e.getCause();
      }
      throw new MtasConfigException(
          reason.getClass().getName() + " : '" + reason.getMessage() + "'");
    }
  }

  /**
   * Reads and processes the configuration from a file. Failures are logged
   * and not propagated; the file is closed in every case.
   *
   * @param configFile the config file
   */
  private void readConfigurationFile(final String configFile) {
    try (InputStream is = new FileInputStream(configFile)) {
      processConfiguration(MtasConfiguration.readConfiguration(is));
    } catch (FileNotFoundException e) {
      log.error("Couldn't find " + configFile, e);
    } catch (IOException e) {
      log.error("Couldn't read " + configFile, e);
    }
  }

  /**
   * Extracts the encoding flags and the parser settings from the direct
   * children of the configuration. Children with other names are ignored.
   *
   * @param config the config
   * @throws IOException if the configuration is {@code null} or a parser
   *           child has no name attribute
   */
  private void processConfiguration(final MtasConfiguration config)
      throws IOException {
    if (config == null) {
      throw new IOException("no (valid) configuration");
    }
    HashMap<String, Integer> indexEncodingMapper = new HashMap<>();
    indexEncodingMapper.put("payload", MtasPayloadEncoder.ENCODE_PAYLOAD);
    indexEncodingMapper.put("offset", MtasPayloadEncoder.ENCODE_OFFSET);
    indexEncodingMapper.put("realoffset", MtasPayloadEncoder.ENCODE_REALOFFSET);
    indexEncodingMapper.put("parent", MtasPayloadEncoder.ENCODE_PARENT);
    for (int i = 0; i < config.children.size(); i++) {
      MtasConfiguration child = config.children.get(i);
      if (CONFIGURATION_MTAS_INDEX.equals(child.name)) {
        processIndexConfiguration(child, indexEncodingMapper);
      } else if (CONFIGURATION_MTAS_PARSER.equals(child.name)) {
        if (!child.attributes
            .containsKey(CONFIGURATION_MTAS_PARSER_ATTRIBUTE)) {
          throw new IOException("no parser configuration");
        }
        parserName = child.attributes.get(CONFIGURATION_MTAS_PARSER_ATTRIBUTE);
        parserConfiguration = child;
      }
    }
  }

  /**
   * Applies the settings of an index configuration to the encoding flags. A
   * recognised setting with value "true"/"1" sets its flag, "false"/"0"
   * clears it; a missing or any other value leaves the flag untouched.
   *
   * @param index the index configuration
   * @param indexEncodingMapper mapping from setting name to encoding flag
   */
  private void processIndexConfiguration(final MtasConfiguration index,
      final HashMap<String, Integer> indexEncodingMapper) {
    for (int j = 0; j < index.children.size(); j++) {
      MtasConfiguration setting = index.children.get(j);
      Integer flag = indexEncodingMapper.get(setting.name);
      if (flag == null) {
        continue;
      }
      String value = setting.attributes.get(CONFIGURATION_MTAS_INDEX_ATTRIBUTE);
      // constant-first comparison: the attribute may be absent (null)
      if (VALUE_TRUE.equals(value) || VALUE_1.equals(value)) {
        encodingFlags |= flag;
      } else if (VALUE_FALSE.equals(value) || VALUE_0.equals(value)) {
        encodingFlags &= ~flag;
      }
    }
  }

  /**
   * Two tokenizers are equal when they are of exactly this class and the
   * superclass considers them equal.
   *
   * @param obj the object to compare with
   * @return {@code true} if equal
   * @see org.apache.lucene.util.AttributeSource#equals(java.lang.Object)
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    return super.equals(obj);
  }

  /**
   * Uses the hash code of the superclass, consistent with
   * {@link #equals(Object)}.
   *
   * @return the hash code
   * @see org.apache.lucene.util.AttributeSource#hashCode()
   */
  @Override
  public int hashCode() {
    return super.hashCode();
  }

}