// MtasSketchParser.java
package mtas.analysis.parser;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.token.MtasTokenIdFactory;
import mtas.analysis.util.MtasBufferedReader;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasConfiguration;
import mtas.analysis.util.MtasParserException;
/**
 * Parser that turns line-oriented input into a {@link MtasTokenCollection}.
 *
 * <p>
 * The input is read line by line. A (trimmed) line that consists of exactly
 * one tag is treated as a group marker: a start tag opens a group, an end tag
 * closes the most recently opened group and a self-closing tag is ignored.
 * Every other line is treated as one word; the line is split on tab
 * characters and column {@code i} (zero-based) becomes a word annotation when
 * a {@code wordAnnotation} mapping with name {@code i} has been configured.
 * </p>
 */
public final class MtasSketchParser extends MtasBasicParser {

  /** The Constant log. */
  private static final Log log = LogFactory.getLog(MtasSketchParser.class);

  /** Matches a trimmed line that consists of exactly one tag. */
  private static final Pattern TAG_LINE_PATTERN = Pattern.compile("^<[^>]*>$");

  /** Matches a self-closing tag. */
  private static final Pattern GROUP_PATTERN = Pattern
      .compile("^<([^\\/>]+)\\/>$");

  /** Matches a start tag: group 1 is the name, group 2 the attribute text. */
  private static final Pattern GROUP_START_PATTERN = Pattern
      .compile("^<([^>\\/\\s][^>\\s]*)(|\\s[^>]+)>$");

  /** Matches an end tag: group 1 is the name. */
  private static final Pattern GROUP_END_PATTERN = Pattern
      .compile("^<\\/([^>\\s]+)>$");

  /** Matches one {@code name="value"} pair inside the attribute text. */
  private static final Pattern ATTRIBUTE_PATTERN = Pattern
      .compile("([^\\s]+)=\"([^\"]*)\"");

  /** The word type; created in {@link #initParser()} when a config exists. */
  private MtasParserType<MtasParserMapping<?>> wordType = null;

  /** The word annotation types, keyed by zero-based column number. */
  private final Map<Integer, MtasParserType<MtasParserMapping<?>>> wordAnnotationTypes = new HashMap<>();

  /** The group types, keyed by tag name. */
  private final Map<String, MtasParserType<MtasParserMapping<?>>> groupTypes = new HashMap<>();

  /**
   * Instantiates a new mtas sketch parser.
   *
   * <p>
   * A configuration problem is logged and not rethrown, so the parser may be
   * left with an incomplete set of mappings.
   * </p>
   *
   * @param config the config
   */
  public MtasSketchParser(MtasConfiguration config) {
    super(config);
    autorepair = true;
    try {
      initParser();
    } catch (MtasConfigException e) {
      log.error(e);
    }
  }

  /**
   * Reads the {@code mapping} children of every {@code mappings} child of the
   * configuration and registers them as word, word annotation or group
   * mappings.
   *
   * @throws MtasConfigException if a mapping has an unknown type, lacks a
   *         required name, or a word annotation name is not a number
   * @see mtas.analysis.parser.MtasParser#initParser()
   */
  @Override
  protected void initParser() throws MtasConfigException {
    super.initParser();
    if (config == null) {
      return;
    }
    // always word, no mappings
    wordType = new MtasParserType<>(MAPPING_TYPE_WORD, null, false);
    for (int i = 0; i < config.children.size(); i++) {
      MtasConfiguration current = config.children.get(i);
      if (!current.name.equals("mappings")) {
        continue;
      }
      for (int j = 0; j < current.children.size(); j++) {
        MtasConfiguration mapping = current.children.get(j);
        if (mapping.name.equals("mapping")) {
          registerMapping(mapping);
        }
      }
    }
  }

  /**
   * Registers a single {@code mapping} configuration element. Elements
   * without a {@code type} attribute are skipped.
   *
   * @param mapping the mapping configuration
   * @throws MtasConfigException if the type is unknown, a required name is
   *         missing, or a word annotation name is not a number
   */
  private void registerMapping(MtasConfiguration mapping)
      throws MtasConfigException {
    String typeMapping = mapping.attributes.get("type");
    String nameMapping = mapping.attributes.get("name");
    if (typeMapping == null) {
      return;
    }
    if (typeMapping.equals(MAPPING_TYPE_WORD)) {
      MtasSketchParserMappingWord m = new MtasSketchParserMappingWord();
      m.processConfig(mapping);
      wordType.addItem(m);
    } else if (typeMapping.equals(MAPPING_TYPE_WORD_ANNOTATION)
        && (nameMapping != null)) {
      MtasSketchParserMappingWordAnnotation m = new MtasSketchParserMappingWordAnnotation();
      m.processConfig(mapping);
      // the name of a word annotation is the column it is read from
      Integer column = parseColumnNumber(nameMapping);
      MtasParserType<MtasParserMapping<?>> t = wordAnnotationTypes.get(column);
      if (t != null) {
        t.addItem(m);
      } else {
        t = new MtasParserType<>(typeMapping, nameMapping, false);
        t.addItem(m);
        wordAnnotationTypes.put(column, t);
      }
    } else if (typeMapping.equals(MAPPING_TYPE_GROUP)
        && (nameMapping != null)) {
      MtasSketchParserMappingGroup m = new MtasSketchParserMappingGroup();
      m.processConfig(mapping);
      MtasParserType<MtasParserMapping<?>> t = groupTypes.get(nameMapping);
      if (t != null) {
        t.addItem(m);
      } else {
        t = new MtasParserType<>(typeMapping, nameMapping, false);
        t.addItem(m);
        groupTypes.put(nameMapping, t);
      }
    } else {
      throw new MtasConfigException(
          "unknown mapping type " + typeMapping + " or missing name");
    }
  }

  /**
   * Converts the name of a word annotation mapping into a column number.
   *
   * @param nameMapping the name attribute of the mapping
   * @return the column number
   * @throws MtasConfigException if the name is not a valid integer
   */
  private Integer parseColumnNumber(String nameMapping)
      throws MtasConfigException {
    try {
      return Integer.valueOf(Integer.parseInt(nameMapping));
    } catch (NumberFormatException e) {
      // report a configuration problem instead of an unchecked exception
      throw new MtasConfigException("name of " + MAPPING_TYPE_WORD_ANNOTATION
          + " mapping should be a column number, found " + nameMapping);
    }
  }

  /**
   * Collects all {@code name="value"} pairs found in the attribute text of a
   * start tag.
   *
   * @param attributeText the (non-empty) attribute text
   * @return the attributes; empty when no pair could be recognised
   */
  private static HashMap<String, String> parseAttributes(
      String attributeText) {
    HashMap<String, String> attributes = new HashMap<String, String>();
    Matcher matcherAttribute = ATTRIBUTE_PATTERN.matcher(attributeText);
    while (matcherAttribute.find()) {
      attributes.put(matcherAttribute.group(1), matcherAttribute.group(2));
    }
    return attributes;
  }

  /**
   * Reads the input line by line and builds the token collection: tag lines
   * open or close groups, all other lines produce a word and its word
   * annotations. Afterwards tokens that refer to other objects are updated
   * with the offsets and positions of those objects and the collection is
   * checked.
   *
   * @param reader the reader providing the input
   * @return the token collection
   * @throws MtasParserException if reading fails or an object cannot be
   *         processed
   * @throws MtasConfigException if the configuration is invalid
   * @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
   */
  @Override
  public MtasTokenCollection createTokenCollection(Reader reader)
      throws MtasParserException, MtasConfigException {
    AtomicInteger position = new AtomicInteger(0);
    Integer unknownAncestors = 0;
    Map<String, Set<Integer>> idPositions = new HashMap<>();
    Map<String, Integer[]> idOffsets = new HashMap<>();
    Map<String, Map<Integer, Set<String>>> updateList = createUpdateList();
    Map<String, List<MtasParserObject>> currentList = createCurrentList();
    tokenCollection = new MtasTokenCollection();
    MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
    try (MtasBufferedReader br = new MtasBufferedReader(reader)) {
      String line;
      int currentOffset;
      int previousOffset = br.getPosition();
      MtasParserObject currentObject;
      while ((line = br.readLine()) != null) {
        currentOffset = br.getPosition();
        String trimmedLine = line.trim();
        if (TAG_LINE_PATTERN.matcher(trimmedLine).matches()) {
          // group
          Matcher matcherGroupStart = GROUP_START_PATTERN.matcher(trimmedLine);
          Matcher matcherGroupEnd = GROUP_END_PATTERN.matcher(trimmedLine);
          Matcher matcherGroup = GROUP_PATTERN.matcher(trimmedLine);
          // a full (self-closing) group is ignored
          if (!matcherGroup.find()) {
            if (matcherGroupStart.find()) {
              // start group
              MtasParserType<MtasParserMapping<?>> groupType = groupTypes
                  .get(matcherGroupStart.group(1));
              if (currentList.get(MAPPING_TYPE_WORD).isEmpty()
                  && currentList.get(MAPPING_TYPE_RELATION).isEmpty()
                  && currentList.get(MAPPING_TYPE_GROUP_ANNOTATION).isEmpty()
                  && groupType != null) {
                currentObject = new MtasParserObject(groupType);
                currentObject.setUnknownAncestorNumber(unknownAncestors);
                currentObject.setRealOffsetStart(previousOffset);
                String attributeText = matcherGroupStart.group(2).trim();
                if (!attributeText.isEmpty()) {
                  currentObject.objectAttributes = parseAttributes(
                      attributeText);
                }
                if (!prevalidateObject(currentObject, currentList)) {
                  unknownAncestors++;
                } else {
                  currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
                  unknownAncestors = 0;
                }
              }
            } else if (matcherGroupEnd.find()) {
              // end group: closes the most recently opened group
              if (!currentList.get(MAPPING_TYPE_GROUP).isEmpty()
                  && groupTypes.get(matcherGroupEnd.group(1)) != null) {
                List<MtasParserObject> openGroups = currentList
                    .get(MAPPING_TYPE_GROUP);
                currentObject = openGroups.remove(openGroups.size() - 1);
                assert unknownAncestors == 0 : "error in administration "
                    + currentObject.getType().getName();
                // ignore text: should not occur
                currentObject.setRealOffsetEnd(currentOffset - 1);
                idPositions.put(currentObject.getId(),
                    currentObject.getPositions());
                idOffsets.put(currentObject.getId(),
                    currentObject.getOffset());
                currentObject.updateMappings(idPositions, idOffsets);
                unknownAncestors = currentObject.getUnknownAncestorNumber();
                computeMappingsFromObject(mtasTokenIdFactory, currentObject,
                    currentList, updateList);
              }
            }
          }
        } else if (currentList.get(MAPPING_TYPE_RELATION).isEmpty()
            && currentList.get(MAPPING_TYPE_GROUP_ANNOTATION).isEmpty()
            && currentList.get(MAPPING_TYPE_WORD).isEmpty()
            && currentList.get(MAPPING_TYPE_WORD_ANNOTATION).isEmpty()
            && (wordType != null)) {
          // start word
          currentObject = new MtasParserObject(wordType);
          currentObject.setOffsetStart(previousOffset);
          currentObject.setRealOffsetStart(previousOffset);
          currentObject.setUnknownAncestorNumber(unknownAncestors);
          if (!prevalidateObject(currentObject, currentList)) {
            unknownAncestors++;
          } else {
            int p = position.getAndIncrement();
            currentObject.addPosition(p);
            currentList.get(MAPPING_TYPE_WORD).add(currentObject);
            unknownAncestors = 0;
          }
          if (currentList.get(MAPPING_TYPE_RELATION).isEmpty()
              && currentList.get(MAPPING_TYPE_GROUP_ANNOTATION).isEmpty()
              && !currentList.get(MAPPING_TYPE_WORD).isEmpty()) {
            // start and finish word annotations, one per configured column
            String[] items = line.split("\t");
            for (int i = 0; i < items.length; i++) {
              MtasParserType<MtasParserMapping<?>> annotationType = wordAnnotationTypes
                  .get(i);
              if (annotationType == null) {
                continue;
              }
              // start word annotation
              currentObject = new MtasParserObject(annotationType);
              currentObject.setRealOffsetStart(previousOffset);
              currentObject.addPositions(currentList.get(MAPPING_TYPE_WORD)
                  .get(currentList.get(MAPPING_TYPE_WORD).size() - 1)
                  .getPositions());
              currentObject.setUnknownAncestorNumber(unknownAncestors);
              if (!prevalidateObject(currentObject, currentList)) {
                unknownAncestors++;
              } else {
                currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
                    .add(currentObject);
                unknownAncestors = 0;
              }
              // finish word annotation
              if (unknownAncestors > 0) {
                unknownAncestors--;
              } else {
                List<MtasParserObject> openAnnotations = currentList
                    .get(MAPPING_TYPE_WORD_ANNOTATION);
                currentObject = openAnnotations
                    .remove(openAnnotations.size() - 1);
                assert unknownAncestors == 0 : "error in administration "
                    + currentObject.getType().getName();
                currentObject.setText(items[i]);
                currentObject.setRealOffsetEnd(currentOffset - 1);
                idPositions.put(currentObject.getId(),
                    currentObject.getPositions());
                idOffsets.put(currentObject.getId(),
                    currentObject.getOffset());
                // offset always null, so update later with word (should be
                // possible)
                if ((currentObject.getId() != null)
                    && !currentList.get(MAPPING_TYPE_WORD).isEmpty()) {
                  currentList.get(MAPPING_TYPE_WORD)
                      .get(currentList.get(MAPPING_TYPE_WORD).size() - 1)
                      .addUpdateableIdWithOffset(currentObject.getId());
                }
                currentObject.updateMappings(idPositions, idOffsets);
                unknownAncestors = currentObject.getUnknownAncestorNumber();
                computeMappingsFromObject(mtasTokenIdFactory, currentObject,
                    currentList, updateList);
              }
            }
          }
          // finish word
          if (unknownAncestors > 0) {
            unknownAncestors--;
          } else {
            List<MtasParserObject> openWords = currentList
                .get(MAPPING_TYPE_WORD);
            currentObject = openWords.remove(openWords.size() - 1);
            assert unknownAncestors == 0 : "error in administration "
                + currentObject.getType().getName();
            currentObject.setText(null);
            currentObject.setOffsetEnd(currentOffset - 1);
            currentObject.setRealOffsetEnd(currentOffset - 1);
            // update ancestor groups with position and offset
            for (MtasParserObject currentGroup : currentList
                .get(MAPPING_TYPE_GROUP)) {
              currentGroup.addPositions(currentObject.getPositions());
              currentGroup.addOffsetStart(currentObject.getOffsetStart());
              currentGroup.addOffsetEnd(currentObject.getOffsetEnd());
            }
            idPositions.put(currentObject.getId(),
                currentObject.getPositions());
            idOffsets.put(currentObject.getId(), currentObject.getOffset());
            currentObject.updateMappings(idPositions, idOffsets);
            unknownAncestors = currentObject.getUnknownAncestorNumber();
            computeMappingsFromObject(mtasTokenIdFactory, currentObject,
                currentList, updateList);
          }
        }
        previousOffset = br.getPosition();
      }
    } catch (IOException e) {
      log.debug(e);
      // keep the original exception reachable for callers
      MtasParserException parserException = new MtasParserException(
          e.getMessage());
      parserException.initCause(e);
      throw parserException;
    }
    // update tokens with offset
    for (Entry<Integer, Set<String>> updateItem : updateList
        .get(UPDATE_TYPE_OFFSET).entrySet()) {
      for (String refId : updateItem.getValue()) {
        Integer[] refOffset = idOffsets.get(refId);
        if (refOffset != null) {
          tokenCollection.get(updateItem.getKey()).addOffset(refOffset[0],
              refOffset[1]);
        }
      }
    }
    // update tokens with position
    for (Entry<Integer, Set<String>> updateItem : updateList
        .get(UPDATE_TYPE_POSITION).entrySet()) {
      for (String refId : updateItem.getValue()) {
        MtasToken token = tokenCollection.get(updateItem.getKey());
        Set<Integer> refPositions = idPositions.get(refId);
        // same guard as for offsets: skip references without positions
        if (refPositions != null) {
          token.addPositions(refPositions);
        }
      }
    }
    // final check
    tokenCollection.check(autorepair, makeunique);
    return tokenCollection;
  }

  /**
   * Builds a textual overview of the configured word annotation types.
   *
   * @return the overview
   * @see mtas.analysis.parser.MtasParser#printConfig()
   */
  @Override
  public String printConfig() {
    StringBuilder text = new StringBuilder();
    text.append("=== CONFIGURATION ===\n");
    text.append("type: ").append(wordAnnotationTypes.size())
        .append(" x wordAnnotation\n");
    text.append(printConfigTypes(wordAnnotationTypes));
    text.append("=== CONFIGURATION ===\n");
    return text.toString();
  }

  /**
   * Prints the config types: one line per type with its number of mappings,
   * followed by one indented line per mapping.
   *
   * @param types the types
   * @return the string
   */
  private String printConfigTypes(
      Map<?, MtasParserType<MtasParserMapping<?>>> types) {
    StringBuilder text = new StringBuilder();
    for (Entry<?, MtasParserType<MtasParserMapping<?>>> entry : types
        .entrySet()) {
      MtasParserType<MtasParserMapping<?>> type = entry.getValue();
      text.append("- ").append(entry.getKey()).append(": ")
          .append(type.items.size()).append(" mapping(s)\n");
      for (int i = 0; i < type.items.size(); i++) {
        text.append("\t").append(type.items.get(i)).append("\n");
      }
    }
    return text.toString();
  }

  /**
   * Mapping for words: position, real offset and offset all come from the
   * word itself.
   */
  private class MtasSketchParserMappingWord
      extends MtasParserMapping<MtasSketchParserMappingWord> {

    /**
     * Instantiates a new mtas sketch parser mapping word.
     */
    public MtasSketchParserMappingWord() {
      super();
      this.position = SOURCE_OWN;
      this.realOffset = SOURCE_OWN;
      this.offset = SOURCE_OWN;
      this.type = MAPPING_TYPE_WORD;
    }

    /**
     * Returns this mapping.
     *
     * @return this mapping
     * @see MtasParserMapping#self()
     */
    @Override
    protected MtasSketchParserMappingWord self() {
      return this;
    }
  }

  /**
   * Mapping for word annotations: position and real offset come from the
   * annotation itself, the offset from the ancestor word.
   */
  private class MtasSketchParserMappingWordAnnotation
      extends MtasParserMapping<MtasSketchParserMappingWordAnnotation> {

    /**
     * Instantiates a new mtas sketch parser mapping word annotation.
     */
    public MtasSketchParserMappingWordAnnotation() {
      super();
      this.position = SOURCE_OWN;
      this.realOffset = SOURCE_OWN;
      this.offset = SOURCE_ANCESTOR_WORD;
      this.type = MAPPING_TYPE_WORD_ANNOTATION;
    }

    /**
     * Returns this mapping.
     *
     * @return this mapping
     * @see MtasParserMapping#self()
     */
    @Override
    protected MtasSketchParserMappingWordAnnotation self() {
      return this;
    }
  }

  /**
   * Mapping for groups: position, real offset and offset all come from the
   * group itself.
   */
  private class MtasSketchParserMappingGroup
      extends MtasParserMapping<MtasSketchParserMappingGroup> {

    /**
     * Instantiates a new mtas sketch parser mapping group.
     */
    public MtasSketchParserMappingGroup() {
      super();
      this.position = SOURCE_OWN;
      this.realOffset = SOURCE_OWN;
      this.offset = SOURCE_OWN;
      this.type = MAPPING_TYPE_GROUP;
    }

    /**
     * Returns this mapping.
     *
     * @return this mapping
     * @see MtasParserMapping#self()
     */
    @Override
    protected MtasSketchParserMappingGroup self() {
      return this;
    }
  }
}