// MtasPennTreebankParser.java

package mtas.analysis.parser;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.token.MtasTokenIdFactory;
import mtas.analysis.token.MtasTokenString;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasConfiguration;
import mtas.analysis.util.MtasParserException;
import mtas.analysis.util.MtasPennTreebankReader;

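/**
 * Parser that reads Penn Treebank bracket notation from a {@link Reader} and
 * converts it into a {@link MtasTokenCollection}: string tokens, node tokens
 * spanning the positions of their children, and tokens for {@code CODE}
 * nodes. Nodes listed in the {@code ignore} section of the configuration are
 * skipped together with everything nested inside them.
 */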
public class MtasPennTreebankParser extends MtasParser {

  /** Logger used by this parser. */
  private static final Log log = LogFactory
      .getLog(MtasPennTreebankParser.class);

  /**
   * Name of the configuration element whose {@code node} children list the
   * nodes that should be ignored.
   */
  private static final String PENNTREEBANK_IGNORE = "ignore";

  /** Name of the configuration element describing a single node. */
  private static final String PENNTREEBANK_NODE = "node";

  /** Name of the attribute on a node element that holds the node name. */
  private static final String PENNTREEBANK_NODE_NAME = "name";

  /**
   * Node name that gets special treatment while parsing: it must be followed
   * directly by a string and may not contain nested brackets.
   */
  private static final String NODE_CODE = "CODE";

  /**
   * Prefix of a string whose position is collected for the next
   * {@link #NODE_CODE} node; the prefix is stripped from the string value.
   */
  private static final String NODE_CODE_PREFIX = "$";

  /**
   * Separator used to split a string value into parts (the first part becomes
   * the "t" token, the second the "lemma" token).
   */
  private static final String STRING_SPLITTER = "_";
    
  /** Names of the nodes to ignore, filled from the configuration. */
  private Set<String> ignoreNodes = new HashSet<>();

  /**
   * Instantiates a new mtas penn treebank parser and initialises it from the
   * provided configuration.
   *
   * A configuration problem does not abort construction: it is logged and the
   * parser is left with whatever was initialised up to that point.
   *
   * @param config
   *          the config
   */
  public MtasPennTreebankParser(MtasConfiguration config) {
    super(config);
    try {
      initParser();
    } catch (MtasConfigException e) {
      // hand the exception over as throwable, so the logger can record the
      // stack trace instead of only its string representation
      log.error("could not initialise parser: " + e.getMessage(), e);
    }
  }

  /*
   * (non-Javadoc)
   * 
   * Collects the names of the nodes to ignore: every "node" element inside an
   * "ignore" element of the configuration contributes the value of its "name"
   * attribute. Nodes without a name, or with an empty name, are skipped.
   * 
   * @see mtas.analysis.parser.MtasParser#initParser()
   */
  @Override
  protected void initParser() throws MtasConfigException {
    super.initParser();
    if (config == null || config.children == null) {
      return;
    }
    for (MtasConfiguration current : config.children) {
      if (!PENNTREEBANK_IGNORE.equals(current.name)
          || current.children == null) {
        continue;
      }
      for (MtasConfiguration candidate : current.children) {
        if (!PENNTREEBANK_NODE.equals(candidate.name)
            || candidate.attributes == null) {
          continue;
        }
        // the attribute may be absent, in which case get() returns null
        String nameVariable = candidate.attributes.get(PENNTREEBANK_NODE_NAME);
        if (nameVariable != null && !nameVariable.isEmpty()) {
          ignoreNodes.add(nameVariable);
        }
      }
    }
  }

  /**
   * Reads Penn Treebank events from the reader and builds the token
   * collection.
   *
   * <ul>
   * <li>start bracket: opens a new level that inherits the current ignore
   * state;</li>
   * <li>node: registers the node name on the current level; a node listed in
   * the ignore configuration marks the level (and everything nested in it) as
   * ignored; a {@code CODE} node must be followed directly by a string, which
   * becomes a token over the positions collected from preceding
   * {@code $}-prefixed strings;</li>
   * <li>string: creates the string tokens at the next position;</li>
   * <li>end bracket: creates the node token for the level and merges its
   * positions and offsets into the parent level.</li>
   * </ul>
   *
   * @param reader
   *          the reader providing the Penn Treebank text
   * @return the token collection, after the final check
   * @throws MtasParserException
   *           if the input is structurally invalid or cannot be read
   * @throws MtasConfigException
   *           declared by the overridden method
   * @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
   */
  @Override
  public MtasTokenCollection createTokenCollection(Reader reader)
      throws MtasParserException, MtasConfigException {
    tokenCollection = new MtasTokenCollection();
    MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
    // stack of currently open levels, innermost last
    List<Level> levels = new ArrayList<>();
    try {
      MtasPennTreebankReader treebankReader = new MtasPennTreebankReader(
          reader);
      // variables main administration
      int event = treebankReader.getEventType();
      int position = 0;
      boolean ignore = false;
      Level level = null;
      // variables for code
      List<Integer> codePositions = new ArrayList<>();
      Integer codeOffsetStart = null;
      Integer codeOffsetEnd = null;
      // variables for string
      String stringValue;
      int stringOffsetStart;
      int stringOffsetEnd;
      // loop
      while (true) {
        switch (event) {
        case MtasPennTreebankReader.EVENT_STARTBRACKET:
          if (level != null && level.code) {
            throw new MtasParserException(
                "unexpected start bracket for " + NODE_CODE);
          } else {
            level = new Level();
            level.ignore = ignore;
            level.realOffsetStart = treebankReader.getPosition() + 1;
            levels.add(level);
          }
          break;
        case MtasPennTreebankReader.EVENT_ENDBRACKET:
          requireLevel(level, "no level while ending bracket");
          level.realOffsetEnd = treebankReader.getPosition() - 1;
          Level parentLevel = levels.size() > 1 ? levels.get(levels.size() - 2)
              : null;
          createNodeMappings(mtasTokenIdFactory, level, parentLevel);
          // remove level
          if (parentLevel != null) {
            if (level.positionStart != null && level.positionEnd != null) {
              parentLevel.addPositionRange(level.positionStart,
                  level.positionEnd);
            }
            // parent keeps its first known start and takes the latest end
            parentLevel.offsetStart = parentLevel.offsetStart == null
                ? level.offsetStart : parentLevel.offsetStart;
            parentLevel.offsetEnd = level.offsetEnd == null
                ? parentLevel.offsetEnd : level.offsetEnd;
            levels.remove(levels.size() - 1);
            level = parentLevel;
            ignore = level.ignore;
          } else {
            levels.clear();
            level = null;
            ignore = false;
          }
          break;
        case MtasPennTreebankReader.EVENT_NODE:
          requireLevel(level, "no level while handling node");
          // register node with level
          level.node = treebankReader.getString();
          if (ignoreNodes.contains(level.node)) {
            ignore = true;
            level.ignore = true;
          }
          if (level.node.equals(NODE_CODE)) {
            level.code = true;
            // the string belonging to the code is consumed here, so it never
            // reaches the regular string handling below
            if (!treebankReader.next() || (event = treebankReader
                .getEventType()) != MtasPennTreebankReader.EVENT_STRING) {
              throw new MtasParserException("expected string for " + NODE_CODE);
            } else if (!level.ignore) {
              stringValue = treebankReader.getString();
              stringOffsetStart = treebankReader.getPosition();
              stringOffsetEnd = stringOffsetStart + stringValue.length();
              if (!codePositions.isEmpty()) {
                // code offsets are set whenever a code position is collected,
                // so they are non-null here
                createCodeMappings(mtasTokenIdFactory, level, stringValue,
                    codeOffsetStart, codeOffsetEnd, stringOffsetStart,
                    stringOffsetEnd, codePositions);
              } else {
                log.error("CODE without codePositions for "+stringValue);
              }
              codePositions.clear();
              codeOffsetStart = null;
              codeOffsetEnd = null;
            }
          }
          break;
        case MtasPennTreebankReader.EVENT_STRING:
          requireLevel(level, "no level while handling string");
          if (level.code) {
            throw new MtasParserException("unexpected string for " + NODE_CODE);
          } else if (!level.ignore) {
            stringValue = treebankReader.getString();
            stringOffsetStart = treebankReader.getPosition();
            stringOffsetEnd = stringOffsetStart + stringValue.length();
            if (level.offsetStart == null) {
              level.offsetStart = stringOffsetStart;
            }
            level.offsetEnd = stringOffsetEnd;
            if (stringValue.startsWith(NODE_CODE_PREFIX)) {
              // remember the position for the next CODE node and strip prefix
              codePositions.add(position);
              stringValue = stringValue.substring(NODE_CODE_PREFIX.length(),
                  stringValue.length());
              if (codeOffsetStart == null) {
                codeOffsetStart = stringOffsetStart;
              }
              codeOffsetEnd = stringOffsetEnd;
            }
            // register position
            level.addPosition(position);
            // create mappings
            createStringMappings(mtasTokenIdFactory, level, stringValue,
                stringOffsetStart, stringOffsetEnd, position);
            // increase position
            position++;
          }
          break;
        default:
          break;
        }
        if (!treebankReader.next()) {
          break;
        } else {
          event = treebankReader.getEventType();
        }
      }
    } catch (IOException e) {
      log.debug(e.getMessage(), e);
      MtasParserException parserException = new MtasParserException(
          "No valid Penn Treebank syntax: " + e.getMessage());
      // keep the original exception reachable for whoever handles this one
      parserException.initCause(e);
      throw parserException;
    }
    // final check
    tokenCollection.check(autorepair, makeunique);
    return tokenCollection;

  }

  /**
   * Verifies that an event is handled inside an open level.
   *
   * @param level
   *          the current level, possibly null
   * @param message
   *          the message for the exception
   * @throws MtasParserException
   *           if there is no current level
   */
  private static void requireLevel(Level level, String message)
      throws MtasParserException {
    if (level == null) {
      throw new MtasParserException(message);
    }
  }

  /**
   * Creates the token for a code node: a single token with the node name as
   * prefix, covering all collected code positions.
   *
   * Only the first part of the (split) string value is used as value. If
   * splitting yields no parts at all, the token gets an empty value.
   *
   * @param mtasTokenIdFactory
   *          the mtas token id factory
   * @param level
   *          the level of the code node; the token is registered with it
   * @param stringValue
   *          the string value
   * @param offsetStart
   *          the offset start
   * @param offsetEnd
   *          the offset end
   * @param realOffsetStart
   *          the real offset start
   * @param realOffsetEnd
   *          the real offset end
   * @param codePositions
   *          the code positions
   * @throws IOException
   *           Signals that an I/O exception has occurred.
   */
  private void createCodeMappings(MtasTokenIdFactory mtasTokenIdFactory,
      Level level, String stringValue, int offsetStart, int offsetEnd,
      int realOffsetStart, int realOffsetEnd, List<Integer> codePositions)
      throws IOException {
    String[] stringValues = MtasPennTreebankReader.createStrings(stringValue,
        Pattern.quote(STRING_SPLITTER));
    // same guard as in createStringMappings: splitting may yield no parts
    String codeValue = stringValues.length > 0 ? stringValues[0].trim() : "";
    MtasToken token = new MtasTokenString(mtasTokenIdFactory.createTokenId(),
        level.node, filterString(codeValue));
    token.setOffset(offsetStart, offsetEnd);
    token.setRealOffset(realOffsetStart, realOffsetEnd);
    token.addPositions(codePositions.stream().mapToInt(i -> i).toArray());
    tokenCollection.add(token);
    level.tokens.add(token);
  }

  /**
   * Creates the token representing a finished level. Nothing happens for a
   * level without a node name or without any positions.
   *
   * The new token has the node name as prefix and an empty value, spans the
   * position range of the level, is registered with the parent level (if any)
   * and becomes the parent of all tokens registered with the level itself.
   *
   * @param mtasTokenIdFactory
   *          the mtas token id factory
   * @param level
   *          the level that is being closed
   * @param parentLevel
   *          the enclosing level, or null if there is none
   */
  private void createNodeMappings(MtasTokenIdFactory mtasTokenIdFactory,
      Level level, Level parentLevel) {
    final boolean hasPositions = level.positionStart != null
        && level.positionEnd != null;
    if (level.node == null || !hasPositions) {
      return;
    }
    final MtasToken nodeToken = new MtasTokenString(
        mtasTokenIdFactory.createTokenId(), level.node, "");
    nodeToken.setOffset(level.offsetStart, level.offsetEnd);
    nodeToken.setRealOffset(level.realOffsetStart, level.realOffsetEnd);
    nodeToken.addPositionRange(level.positionStart, level.positionEnd);
    tokenCollection.add(nodeToken);
    if (parentLevel != null) {
      parentLevel.tokens.add(nodeToken);
    }
    // NOTE(review): the original author questioned whether this should only
    // apply to the first mapping
    level.tokens.forEach(child -> child.setParentId(nodeToken.getId()));
  }

  /**
   * Creates the tokens for a string at a single position.
   *
   * The string value is split on the splitter; the first part becomes a token
   * with prefix "t", the second a token with prefix "lemma". A part that is
   * missing or blank produces no token. Each created token is added to the
   * token collection and registered with the level.
   *
   * @param mtasTokenIdFactory
   *          the mtas token id factory
   * @param level
   *          the level
   * @param stringValue
   *          the string value
   * @param offsetStart
   *          the offset start
   * @param offsetEnd
   *          the offset end
   * @param position
   *          the position
   * @throws IOException
   *           Signals that an I/O exception has occurred.
   */
  private void createStringMappings(MtasTokenIdFactory mtasTokenIdFactory,
      Level level, String stringValue, int offsetStart, int offsetEnd,
      int position) throws IOException {
    final String[] parts = MtasPennTreebankReader.createStrings(stringValue,
        Pattern.quote(STRING_SPLITTER));
    // prefix for the part at the same index
    final String[] prefixes = { "t", "lemma" };
    for (int index = 0; index < prefixes.length; index++) {
      if (index >= parts.length) {
        break;
      }
      final String trimmed = parts[index].trim();
      if (trimmed.isEmpty()) {
        continue;
      }
      MtasToken partToken = new MtasTokenString(
          mtasTokenIdFactory.createTokenId(), prefixes[index],
          filterString(trimmed), position);
      partToken.setOffset(offsetStart, offsetEnd);
      tokenCollection.add(partToken);
      level.tokens.add(partToken);
    }
  }

  /**
   * Converts escaped content in a string value back to plain text.
   *
   * First every <code>{TEXT:...}</code> wrapper is replaced by its content,
   * then the bracket placeholders -LRB-, -RRB-, -LCB-, -RCB-, -LSB- and -RSB-
   * are replaced (in that order) by the bracket characters they stand for.
   *
   * @param stringValue
   *          the string value
   * @return the filtered value
   */
  private String filterString(String stringValue) {
    // placeholder and the literal bracket it represents, in replacement order
    final String[][] brackets = {
        { "-LRB-", "(" }, { "-RRB-", ")" },
        { "-LCB-", "{" }, { "-RCB-", "}" },
        { "-LSB-", "[" }, { "-RSB-", "]" } };
    String result = stringValue.replaceAll("\\{TEXT:([^\\}]*)\\}", "$1");
    for (String[] bracket : brackets) {
      result = result.replace(bracket[0], bracket[1]);
    }
    return result;
  }
  
  /**
   * Matches a value consisting of a part without hyphens, followed by one or
   * more hyphen-separated numbers (captured as group 2).
   */
  private static final Pattern NULL_ELEMENT_REFERENCES_PATTERN = Pattern
      .compile("^([^-]*)-([0-9]+(-[0-9]+)*)$");

  /** Matches everything from a hyphen up to the end of the value. */
  private static final Pattern NULL_ELEMENT_SUFFIX_PATTERN = Pattern
      .compile("-.*$");

  /**
   * Extracts the numeric references from the first string value and strips
   * them from that value.
   *
   * The first element of the array is modified in place: everything from the
   * hyphen matched by the suffix pattern onwards is removed, whether or not
   * the removed part consisted of numeric references. The other elements are
   * not touched.
   *
   * @param stringValues
   *          the string values; the first element is updated in place
   * @return the numeric references of the first element, or an empty array if
   *         the array is empty, its first element is null, or the first
   *         element does not end with hyphen-separated numbers
   * @throws NullPointerException
   *           if stringValues is null
   */
  public String[] filterNullElementReferences(String[] stringValues) {
    Objects.requireNonNull(stringValues, "no stringValues");
    if (stringValues.length == 0 || stringValues[0] == null) {
      return new String[0];
    }
    final String original = stringValues[0];
    stringValues[0] = NULL_ELEMENT_SUFFIX_PATTERN.matcher(original)
        .replaceAll("");
    // references are taken from the value as it was before stripping
    Matcher matcher = NULL_ELEMENT_REFERENCES_PATTERN.matcher(original);
    if (matcher.matches()) {
      return matcher.group(2).split("-");
    }
    return new String[0];
  }

  /*
   * (non-Javadoc)
   * 
   * Returns the textual representation of the configuration between two
   * marker lines. Without a configuration only the marker lines are returned.
   * 
   * @see mtas.analysis.parser.MtasParser#printConfig()
   */
  @Override
  public String printConfig() {
    StringBuilder text = new StringBuilder();
    text.append("=== CONFIGURATION ===\n");
    // initParser accepts a missing configuration, so this method should too
    if (config != null) {
      text.append(config.toString());
    }
    text.append("=== CONFIGURATION ===\n");
    return text.toString();
  }

  /**
   * State of a single bracket level while parsing: its node name, offsets,
   * position range, flags and the tokens registered with it.
   */
  public static class Level {

    /** The node name, null until a node has been registered. */
    public String node;

    /** The offset start, null while unknown. */
    public Integer offsetStart;

    /** The offset end, null while unknown. */
    public Integer offsetEnd;

    /** The real offset start, null while unknown. */
    public Integer realOffsetStart;

    /** The real offset end, null while unknown. */
    public Integer realOffsetEnd;

    /** Whether this level is ignored. */
    public boolean ignore;

    /** Whether this level is a code level. */
    public boolean code;

    /** The lowest position added so far, null if none was added. */
    public Integer positionStart;

    /** The highest position added so far, null if none was added. */
    public Integer positionEnd;

    /** The tokens registered with this level. */
    public List<MtasToken> tokens;

    /**
     * Instantiates a new level: no node, no offsets, no positions, both flags
     * false and an empty list of tokens.
     */
    public Level() {
      // all other fields start with their language defaults (null / false)
      tokens = new ArrayList<>();
    }

    /**
     * Extends the position range of this level to include a position.
     *
     * @param position
     *          the position
     */
    public void addPosition(int position) {
      if (positionStart == null || position < positionStart) {
        positionStart = position;
      }
      if (positionEnd == null || position > positionEnd) {
        positionEnd = position;
      }
    }

    /**
     * Extends the position range of this level to include a range: the start
     * is lowered to the start position if needed, the end is raised to the
     * end position if needed.
     *
     * @param startPosition
     *          the start position
     * @param endPosition
     *          the end position
     */
    public void addPositionRange(int startPosition, int endPosition) {
      if (positionStart == null || startPosition < positionStart) {
        positionStart = startPosition;
      }
      if (positionEnd == null || endPosition > positionEnd) {
        positionEnd = endPosition;
      }
    }

  }

}