MtasTokenCollection.java
package mtas.analysis.token;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;
import mtas.analysis.util.MtasParserException;
/**
* The Class MtasTokenCollection.
*/
public class MtasTokenCollection {
/** The token collection. */
private HashMap<Integer, MtasToken> tokenCollection = new HashMap<>();
/** The token collection index. */
private ArrayList<Integer> tokenCollectionIndex = new ArrayList<>();
/**
* Instantiates a new mtas token collection.
*/
public MtasTokenCollection() {
clear();
}
/**
* Adds the.
*
* @param token the token
* @return the integer
*/
public Integer add(MtasToken token) {
Integer id = token.getId();
tokenCollection.put(id, token);
return id;
}
/**
* Gets the.
*
* @param id the id
* @return the mtas token
*/
public MtasToken get(Integer id) {
return tokenCollection.get(id);
}
/**
* Iterator.
*
* @return the iterator
* @throws MtasParserException the mtas parser exception
*/
public Iterator<MtasToken> iterator() throws MtasParserException {
checkTokenCollectionIndex();
return new Iterator<MtasToken>() {
private Iterator<Integer> indexIterator = tokenCollectionIndex.iterator();
@Override
public boolean hasNext() {
return indexIterator.hasNext();
}
@Override
public MtasToken next() {
return tokenCollection.get(indexIterator.next());
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
/**
* Prints the.
*
* @throws MtasParserException the mtas parser exception
*/
public void print() throws MtasParserException {
Iterator<MtasToken> it = this.iterator();
while (it.hasNext()) {
MtasToken token = it.next();
System.out.println(token);
}
}
/**
* Gets the list.
*
* @return the list
* @throws MtasParserException the mtas parser exception
*/
public String[][] getList() throws MtasParserException {
String[][] result = new String[(tokenCollection.size() + 1)][];
result[0] = new String[] { "id", "start real offset", "end real offset",
"provide real offset", "start offset", "end offset", "provide offset",
"start position", "end position", "multiple positions", "parent",
"provide parent", "payload", "prefix", "postfix" };
int number = 1;
Iterator<MtasToken> it = this.iterator();
while (it.hasNext()) {
MtasToken token = it.next();
String[] row = new String[15];
row[0] = token.getId().toString();
if (token.getRealOffsetStart() != null) {
row[1] = token.getRealOffsetStart().toString();
row[2] = token.getRealOffsetEnd().toString();
row[3] = token.getProvideRealOffset() ? "1" : null;
}
if (token.getOffsetStart() != null) {
row[4] = token.getOffsetStart().toString();
row[5] = token.getOffsetEnd().toString();
row[6] = token.getProvideOffset() ? "1" : null;
}
if (token.getPositionLength() != null) {
if (token.getPositionStart().equals(token.getPositionEnd())) {
row[7] = token.getPositionStart().toString();
row[8] = token.getPositionEnd().toString();
row[9] = null;
} else if ((token.getPositions() == null)
|| (token.getPositions().length == (1 + token.getPositionEnd()
- token.getPositionStart()))) {
row[7] = token.getPositionStart().toString();
row[8] = token.getPositionEnd().toString();
row[9] = null;
} else {
row[7] = null;
row[8] = null;
row[9] = Arrays.toString(token.getPositions());
}
}
if (token.getParentId() != null) {
row[10] = token.getParentId().toString();
row[11] = token.getProvideParentId() ? "1" : null;
}
if (token.getPayload() != null) {
BytesRef payload = token.getPayload();
row[12] = Float.toString(PayloadHelper.decodeFloat(Arrays.copyOfRange(
payload.bytes, payload.offset, (payload.offset + payload.length))));
}
row[13] = token.getPrefix();
row[14] = token.getPostfix();
result[number] = row;
number++;
}
return result;
}
/**
* Check.
*
* @param autoRepair the auto repair
* @param makeUnique the make unique
* @throws MtasParserException the mtas parser exception
*/
public void check(Boolean autoRepair, Boolean makeUnique)
throws MtasParserException {
if (autoRepair) {
autoRepair();
}
if (makeUnique) {
makeUnique();
}
checkTokenCollectionIndex();
for (Integer i : tokenCollectionIndex) {
// minimal properties
if (tokenCollection.get(i).getId() == null
|| tokenCollection.get(i).getPositionStart() == null
|| tokenCollection.get(i).getPositionEnd() == null
|| tokenCollection.get(i).getValue() == null) {
clear();
break;
}
}
}
/**
* Make unique.
*/
private void makeUnique() {
HashMap<String, ArrayList<MtasToken>> currentPositionTokens = new HashMap<>();
ArrayList<MtasToken> currentValueTokens;
int currentStartPosition = -1;
MtasToken currentToken = null;
for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
currentToken = entry.getValue();
if (currentToken.getPositionStart() > currentStartPosition) {
currentPositionTokens.clear();
currentStartPosition = currentToken.getPositionStart();
} else {
if (currentPositionTokens.containsKey(currentToken.getValue())) {
currentValueTokens = currentPositionTokens
.get(currentToken.getValue());
} else {
currentValueTokens = new ArrayList<>();
currentPositionTokens.put(currentToken.getValue(),
currentValueTokens);
}
currentValueTokens.add(currentToken);
}
}
}
/**
* Auto repair.
*/
private void autoRepair() {
ArrayList<Integer> trash = new ArrayList<>();
HashMap<Integer, Integer> translation = new HashMap<>();
HashMap<Integer, MtasToken> newTokenCollection = new HashMap<>();
Integer parentId;
Integer maxId = null;
Integer minId = null;
MtasToken token;
// check id, position and value
for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
token = entry.getValue();
boolean putInTrash;
putInTrash = token.getId() == null;
putInTrash |= (token.getPositionStart() == null)
|| (token.getPositionEnd() == null);
putInTrash |= token.getValue() == null || (token.getValue().isEmpty());
putInTrash |= token.getPrefix() == null || (token.getPrefix().isEmpty());
if (putInTrash) {
trash.add(entry.getKey());
}
}
// check parentId
for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
token = entry.getValue();
parentId = token.getParentId();
if (parentId != null && (!tokenCollection.containsKey(parentId)
|| trash.contains(parentId))) {
token.setParentId(null);
}
}
// empty bin
if (!trash.isEmpty()) {
for (Integer i : trash) {
tokenCollection.remove(i);
}
}
// always check ids
if (tokenCollection.size() > 0) {
for (Integer i : tokenCollection.keySet()) {
maxId = ((maxId == null) ? i : Math.max(maxId, i));
minId = ((minId == null) ? i : Math.min(minId, i));
}
// check
if ((minId > 0) || ((1 + maxId - minId) != tokenCollection.size())) {
int newId = 0;
// create translation
for (Integer i : tokenCollection.keySet()) {
translation.put(i, newId);
newId++;
}
// translate objects
for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
token = entry.getValue();
parentId = token.getParentId();
token.setId(translation.get(entry.getKey()));
if (parentId != null) {
token.setParentId(translation.get(parentId));
}
}
// new tokenCollection
Iterator<Map.Entry<Integer, MtasToken>> iter = tokenCollection
.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry<Integer, MtasToken> entry = iter.next();
newTokenCollection.put(translation.get(entry.getKey()),
entry.getValue());
iter.remove();
}
tokenCollection = newTokenCollection;
}
}
}
/**
* Check token collection index.
*
* @throws MtasParserException the mtas parser exception
*/
private void checkTokenCollectionIndex() throws MtasParserException {
if (tokenCollectionIndex.size() != tokenCollection.size()) {
MtasToken token;
Integer maxId = null;
Integer minId = null;
tokenCollectionIndex.clear();
for (Entry<Integer, MtasToken> entry : tokenCollection.entrySet()) {
token = entry.getValue();
maxId = ((maxId == null) ? entry.getKey()
: Math.max(maxId, entry.getKey()));
minId = ((minId == null) ? entry.getKey()
: Math.min(minId, entry.getKey()));
if (token.getId() == null) {
throw new MtasParserException(
"no id for token (" + token.getValue() + ")");
} else if ((token.getPositionStart() == null)
|| (token.getPositionEnd() == null)) {
throw new MtasParserException("no position for token with id "
+ token.getId() + " (" + token.getValue() + ")");
} else if (token.getValue() == null || (token.getValue().equals(""))) {
throw new MtasParserException(
"no value for token with id " + token.getId());
} else if (token.getPrefix() == null
|| (token.getPrefix().equals(""))) {
throw new MtasParserException(
"no prefix for token with id " + token.getId());
} else if ((token.getParentId() != null)
&& !tokenCollection.containsKey(token.getParentId())) {
throw new MtasParserException(
"missing parentId for token with id " + token.getId());
} else if ((token.getOffsetStart() == null)
|| (token.getOffsetEnd() == null)) {
throw new MtasParserException("missing offset for token with id "
+ token.getId() + " (" + token.getValue() + ")");
}
tokenCollectionIndex.add(entry.getKey());
}
if ((tokenCollection.size() > 0)
&& ((minId > 0) || ((1 + maxId - minId) != tokenCollection.size()))) {
throw new MtasParserException("missing ids");
}
Collections.sort(tokenCollectionIndex, getCompByName());
}
}
/**
* Gets the comp by name.
*
* @return the comp by name
*/
public Comparator<Integer> getCompByName() {
return new Comparator<Integer>() {
@Override
public int compare(Integer t1, Integer t2) {
Integer p1 = tokenCollection.get(t1).getPositionStart();
Integer p2 = tokenCollection.get(t2).getPositionStart();
assert p1 != null : "no position for " + tokenCollection.get(t1);
assert p2 != null : "no position for " + tokenCollection.get(t2);
if (p1.equals(p2)) {
Integer o1 = tokenCollection.get(t1).getOffsetStart();
Integer o2 = tokenCollection.get(t2).getOffsetStart();
if (o1 != null && o2 != null) {
if (o1.equals(o2)) {
return tokenCollection.get(t1).getValue()
.compareTo(tokenCollection.get(t2).getValue());
} else {
return o1.compareTo(o2);
}
} else {
return tokenCollection.get(t1).getValue()
.compareTo(tokenCollection.get(t2).getValue());
}
}
return p1.compareTo(p2);
}
};
}
/**
* Clear.
*/
private void clear() {
tokenCollectionIndex.clear();
tokenCollection.clear();
}
}