MtasSpanRecurrenceSpans.java

package mtas.search.spans;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.Spans;

import mtas.search.spans.util.MtasIgnoreItem;
import mtas.search.spans.util.MtasSpans;

/**
 * The Class MtasSpanRecurrenceSpans.
 */
public class MtasSpanRecurrenceSpans extends MtasSpans {

  /** The log. */
  private static Log log = LogFactory.getLog(MtasSpanRecurrenceSpans.class);

  /** The query. */
  private MtasSpanRecurrenceQuery query;

  /** The spans. */
  private Spans spans;

  /** The ignore item. */
  private MtasIgnoreItem ignoreItem;

  /** The minimum recurrence. */
  int minimumRecurrence;

  /** The maximum recurrence. */
  int maximumRecurrence;

  /** The queue spans. */
  List<Match> queueSpans;

  /** The queue matches. */
  List<Match> queueMatches;

  /** The current match. */
  Match currentMatch;

  /** The no more positions. */
  boolean noMorePositions;

  /** The last start position. */
  int lastStartPosition; // startPosition of last retrieved span

  /** The last span. */
  boolean lastSpan; // last span for this document added to queue

  /**
   * Instantiates a new mtas span recurrence spans.
   *
   * @param query the query
   * @param spans the spans
   * @param minimumRecurrence the minimum recurrence
   * @param maximumRecurrence the maximum recurrence
   * @param ignoreSpans the ignore spans
   * @param maximumIgnoreLength the maximum ignore length
   */
  public MtasSpanRecurrenceSpans(MtasSpanRecurrenceQuery query, Spans spans,
      int minimumRecurrence, int maximumRecurrence, Spans ignoreSpans,
      Integer maximumIgnoreLength) {
    assert minimumRecurrence <= maximumRecurrence : "minimumRecurrence > maximumRecurrence";
    assert minimumRecurrence > 0 : "minimumRecurrence < 1 not supported";
    this.query = query;
    this.spans = spans;
    this.minimumRecurrence = minimumRecurrence;
    this.maximumRecurrence = maximumRecurrence;
    queueSpans = new ArrayList<>();
    queueMatches = new ArrayList<>();
    ignoreItem = new MtasIgnoreItem(ignoreSpans, maximumIgnoreLength);
    resetQueue();
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.spans.Spans#nextStartPosition()
   */
  @Override
  public int nextStartPosition() throws IOException {
    if (findMatches()) {
      currentMatch = queueMatches.get(0);
      queueMatches.remove(0);
      noMorePositions = false;
      return currentMatch.startPosition();
    } else {
      currentMatch = null;
      noMorePositions = true;
      return NO_MORE_POSITIONS;
    }
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.spans.Spans#startPosition()
   */
  @Override
  public int startPosition() {
    if (currentMatch == null) {
      if (noMorePositions) {
        return NO_MORE_POSITIONS;
      } else {
        return -1;
      }
    } else {
      return currentMatch.startPosition();
    }
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.spans.Spans#endPosition()
   */
  @Override
  public int endPosition() {
    if (currentMatch == null) {
      if (noMorePositions) {
        return NO_MORE_POSITIONS;
      } else {
        return -1;
      }
    } else {
      return currentMatch.endPosition();
    }
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.spans.Spans#width()
   */
  @Override
  public int width() {
    return 1;
  }

  /*
   * (non-Javadoc)
   * 
   * @see
   * org.apache.lucene.search.spans.Spans#collect(org.apache.lucene.search.spans
   * .SpanCollector)
   */
  @Override
  public void collect(SpanCollector collector) throws IOException {
    spans.collect(collector);
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.DocIdSetIterator#docID()
   */
  @Override
  public int docID() {
    return spans.docID();
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.DocIdSetIterator#nextDoc()
   */
  @Override
  public int nextDoc() throws IOException {
    resetQueue();
    return (spans.nextDoc() == NO_MORE_DOCS) ? NO_MORE_DOCS : toMatchDoc();
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.DocIdSetIterator#advance(int)
   */
  @Override
  public int advance(int target) throws IOException {
    resetQueue();
    return (spans.advance(target) == NO_MORE_DOCS) ? NO_MORE_DOCS
        : toMatchDoc();
  }

  /**
   * Reset queue.
   */
  void resetQueue() {
    queueSpans.clear();
    queueMatches.clear();
    lastStartPosition = 0;
    lastSpan = false;
    currentMatch = null;
    noMorePositions = false;
  }

  /**
   * To match doc.
   *
   * @return the int
   * @throws IOException Signals that an I/O exception has occurred.
   */
  int toMatchDoc() throws IOException {
    while (true) {
      if (findMatches()) {
        return docID();
      }
      resetQueue();
      if (spans.nextDoc() == NO_MORE_DOCS) {
        return NO_MORE_DOCS;
      }
    }
  }

  /**
   * Collect span.
   *
   * @return true, if successful
   * @throws IOException Signals that an I/O exception has occurred.
   */
  // try to get something in the queue of spans
  private boolean collectSpan() throws IOException {
    if (lastSpan) {
      return false;
    } else if (spans.nextStartPosition() == NO_MORE_POSITIONS) {
      lastSpan = true;
      return false;
    } else {
      queueSpans.add(new Match(spans.startPosition(), spans.endPosition()));
      lastStartPosition = spans.startPosition();
      return true;
    }
  }

  /**
   * Find matches.
   *
   * @return true, if successful
   * @throws IOException Signals that an I/O exception has occurred.
   */
  private boolean findMatches() throws IOException {
    // check for something in queue of matches
    if (!queueMatches.isEmpty()) {
      return true;
    } else {
      ignoreItem.advanceToDoc(spans.docID());
      while (true) {
        // try to get something in queue of spans
        if (queueSpans.isEmpty() && !collectSpan()) {
          return false;
        }
        // try to get matches with first span in queue
        Match firstMatch = queueSpans.remove(0);
        // create a list of matches with same startPosition as firstMatch
        List<Match> matches = new ArrayList<>();
        matches.add(firstMatch);
        // matches.addAll(expandWithIgnoreItem(spans.docID(), firstMatch));
        // try to collect spans until lastStartPosition not equal to
        // startPosition of firstMatch
        while (!lastSpan && (lastStartPosition == firstMatch.startPosition())) {
          collectSpan();
        }
        while (!queueSpans.isEmpty() && (queueSpans.get(0)
            .startPosition() == firstMatch.startPosition())) {
          Match additionalMatch = queueSpans.remove(0);
          matches.add(additionalMatch);
          matches.addAll(expandWithIgnoreItem(spans.docID(), additionalMatch));
        }
        // construct all matches for this startPosition
        for (Match match : matches) {
          for (int n = (minimumRecurrence - 1); n <= (maximumRecurrence
              - 1); n++) {
            findMatches(match, n);
          }
        }
        // check for something in queue of matches
        if (!queueMatches.isEmpty()) {
          ignoreItem.removeBefore(spans.docID(),
              queueMatches.get(0).startPosition());
          return true;
        }
      }
    }
  }

  /**
   * Find matches.
   *
   * @param match the match
   * @param n the n
   * @throws IOException Signals that an I/O exception has occurred.
   */
  private void findMatches(Match match, int n) throws IOException {
    if (n > 0) {
      int largestMatchingEndPosition = match.endPosition();
      Set<Integer> list = ignoreItem.getFullEndPositionList(spans.docID(),
          match.endPosition());
      // try to find matches with existing queue
      if (!queueSpans.isEmpty()) {
        Match span;
        for (int i = 0; i < queueSpans.size(); i++) {
          span = queueSpans.get(i);
          if (match.endPosition() == span.startPosition()
              || (list != null && list.contains(span.startPosition()))) {
            findMatches(new Match(match.startPosition(), span.endPosition()),
                (n - 1));
            largestMatchingEndPosition = Math.max(largestMatchingEndPosition,
                span.endPosition());
          }
        }
      }
      // extend queue if necessary and possible
      while (!lastSpan && (largestMatchingEndPosition >= lastStartPosition)) {
        if (spans.nextStartPosition() == NO_MORE_POSITIONS) {
          lastSpan = true;
        } else {
          Match span = new Match(spans.startPosition(), spans.endPosition());
          queueSpans.add(span);
          lastStartPosition = spans.startPosition();
          // check if this provides new match
          if (match.endPosition() == span.startPosition()
              || (list != null && list.contains(span.startPosition()))) {
            findMatches(new Match(match.startPosition(), span.endPosition()),
                (n - 1));
            largestMatchingEndPosition = Math.max(largestMatchingEndPosition,
                span.endPosition());
          }
        }
      }
    } else {
      // only unique spans
      if (!queueMatches.contains(match)) {
        queueMatches.add(match);
      }
    }
  }

  /**
   * Expand with ignore item.
   *
   * @param docId the doc id
   * @param match the match
   * @return the list
   */
  private List<Match> expandWithIgnoreItem(int docId, Match match) {
    List<Match> list = new ArrayList<>();
    try {
      Set<Integer> ignoreList = ignoreItem.getFullEndPositionList(docId,
          match.endPosition);
      if (ignoreList != null) {
        for (Integer endPosition : ignoreList) {
          list.add(new Match(match.startPosition, endPosition));
        }
      }
    } catch (IOException e) {
      log.debug(e);
    }
    return list;
  }

  /**
   * The Class Match.
   */
  private static class Match {

    /** The start position. */
    private int startPosition;

    /** The end position. */
    private int endPosition;

    /**
     * Instantiates a new match.
     *
     * @param startPosition the start position
     * @param endPosition the end position
     */
    Match(int startPosition, int endPosition) {
      this.startPosition = startPosition;
      this.endPosition = endPosition;
    }

    /**
     * Start position.
     *
     * @return the int
     */
    public int startPosition() {
      return startPosition;
    }

    /**
     * End position.
     *
     * @return the int
     */
    public int endPosition() {
      return endPosition;
    }

    /*
     * (non-Javadoc)
     * 
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object obj) {
      if (this == obj)
        return true;
      if (obj == null)
        return false;
      if (getClass() != obj.getClass())
        return false;
      final Match that = (Match) obj;
      return startPosition == that.startPosition
          && endPosition == that.endPosition;
    }

    /*
     * (non-Javadoc)
     * 
     * @see java.lang.Object#hashCode()
     */
    @Override
    public int hashCode() {
      int h = this.getClass().getSimpleName().hashCode();
      h = (h * 5) ^ startPosition;
      h = (h * 7) ^ endPosition;
      return h;
    }

  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.DocIdSetIterator#cost()
   */
  @Override
  public long cost() {
    return (spans == null) ? 0 : spans.cost();
  }

  /*
   * (non-Javadoc)
   * 
   * @see org.apache.lucene.search.spans.Spans#positionsCost()
   */
  @Override
  public float positionsCost() {
    return 0;
  }

  /*
   * (non-Javadoc)
   * 
   * @see mtas.search.spans.util.MtasSpans#asTwoPhaseIterator()
   */
  @Override
  public TwoPhaseIterator asTwoPhaseIterator() {
    if (spans == null || !query.twoPhaseIteratorAllowed()) {
      return null;
    } else {
      // TODO
      return null;
    }
  }

}