MtasXMLParser.java
package mtas.analysis.parser;
import java.io.Reader;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.token.MtasTokenIdFactory;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasParserException;
import mtas.analysis.util.MtasConfiguration;
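/**
* Base class for parsers that read an XML document with a StAX stream reader
* and turn its elements into MTAS tokens, driven by the variables, references
* and mappings defined in the parser configuration.
*/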
abstract class MtasXMLParser extends MtasBasicParser {
/** Logger of this class. */
private static final Log log = LogFactory.getLog(MtasXMLParser.class);
/**
* Namespace URI used when building the QNames that element names are matched
* against; taken from the "namespaceURI" element of the configuration.
*/
protected String namespaceURI = null;
/** Namespace URI used when reading the "id" attribute of an element. */
protected String namespaceURI_id = null;
/**
* Local name of the element that has to open the document; when null no root
* element is required.
*/
protected String rootTag = null;
/**
* Local name of the element that starts the content; when null the content
* is parsed from the start of the document.
*/
protected String contentTag = null;
/**
* Whether elements outside the content are accepted; when false such an
* element makes the parser fail.
*/
protected boolean allowNonContent = false;
/** For each relation id, the sorted ref ids registered inside that relation. */
private Map<String, SortedSet<String>> relationKeyMap = new HashMap<>();
/** Cache of the QNames created so far, keyed by local name. */
private Map<String, QName> qNames = new HashMap<>();
/** The configured relation types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> relationTypes = new HashMap<>();
/** The configured relation annotation types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> relationAnnotationTypes = new HashMap<>();
/** The configured reference types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> refTypes = new HashMap<>();
/** The configured group types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> groupTypes = new HashMap<>();
/** The configured group annotation types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> groupAnnotationTypes = new HashMap<>();
/** The configured word types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> wordTypes = new HashMap<>();
/** The configured word annotation types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserMapping<?>>> wordAnnotationTypes = new HashMap<>();
/** The configured variable types, keyed by element QName. */
private Map<QName, MtasParserType<MtasParserVariable>> variableTypes = new HashMap<>();
/** Name of the configuration element that holds the variable definitions. */
private static final String XML_VARIABLES = "variables";
/** Name of the configuration element that defines a single variable. */
private static final String XML_VARIABLE = "variable";
/** Name of the attribute holding the name of a variable. */
private static final String XML_VARIABLE_NAME = "name";
/** Name of the attribute holding the value of a variable. */
private static final String XML_VARIABLE_VALUE = "value";
/** Name of the configuration element that holds the reference definitions. */
private static final String XML_REFERENCES = "references";
/** Name of the configuration element that defines a single reference. */
private static final String XML_REFERENCE = "reference";
/** Name of the attribute holding the name of a reference. */
private static final String XML_REFERENCE_NAME = "name";
/** Name of the attribute holding the ref of a reference. */
private static final String XML_REFERENCE_REF = "ref";
/** Name of the configuration element that holds the mapping definitions. */
private static final String XML_MAPPINGS = "mappings";
/** Name of the configuration element that defines a single mapping. */
private static final String XML_MAPPING = "mapping";
/** Name of the attribute holding the type of a mapping. */
private static final String XML_MAPPING_TYPE = "type";
/** Name of the attribute holding the name of a mapping. */
private static final String XML_MAPPING_NAME = "name";
/**
* Instantiates a new mtas XML parser and initialises it from the given
* configuration.
*
* <p>
* A {@link MtasConfigException} raised while initialising is not propagated
* to the caller; it is logged, including its stack trace, and construction
* completes.
*
* @param config the config
*/
public MtasXMLParser(MtasConfiguration config) {
  super(config);
  try {
    initParser();
  } catch (MtasConfigException e) {
    // hand the exception over as throwable so its stack trace is logged too
    log.error("error while initialising parser: " + e.getMessage(), e);
  }
}
/**
* Builds a textual overview of the configuration: for each kind of type
* (variable, group, groupAnnotation, word, wordAnnotation, relation,
* relationAnnotation and references) one header line with the number of
* registered types, followed by the listing of these types.
*
* @return the overview, enclosed by two "=== CONFIGURATION ===" lines
* @see mtas.analysis.parser.MtasParser#printConfig()
*/
@Override
public String printConfig() {
  StringBuilder text = new StringBuilder();
  text.append("=== CONFIGURATION ===\n");
  text.append("type: " + variableTypes.size() + " x variable\n");
  text.append(printConfigVariableTypes(variableTypes));
  text.append("type: " + groupTypes.size() + " x group\n");
  text.append(printConfigMappingTypes(groupTypes));
  // every header line ends with a newline, otherwise the first listed type
  // is glued to the header
  text.append("type: " + groupAnnotationTypes.size() + " x groupAnnotation\n");
  text.append(printConfigMappingTypes(groupAnnotationTypes));
  text.append("type: " + wordTypes.size() + " x word\n");
  text.append(printConfigMappingTypes(wordTypes));
  text.append("type: " + wordAnnotationTypes.size() + " x wordAnnotation\n");
  text.append(printConfigMappingTypes(wordAnnotationTypes));
  text.append("type: " + relationTypes.size() + " x relation\n");
  text.append(printConfigMappingTypes(relationTypes));
  text.append(
      "type: " + relationAnnotationTypes.size() + " x relationAnnotation\n");
  text.append(printConfigMappingTypes(relationAnnotationTypes));
  text.append("type: " + refTypes.size() + " x references\n");
  text.append(printConfigMappingTypes(refTypes));
  text.append("=== CONFIGURATION ===\n");
  return text.toString();
}
/**
* Lists the given mapping types: one line per type with its local name and
* number of mappings, followed by one tab-indented line per mapping.
*
* @param types the mapping types to list, keyed by QName
* @return the listing
*/
private String printConfigMappingTypes(
    Map<QName, MtasParserType<MtasParserMapping<?>>> types) {
  StringBuilder overview = new StringBuilder();
  for (Entry<QName, MtasParserType<MtasParserMapping<?>>> typeEntry : types
      .entrySet()) {
    String localName = typeEntry.getKey().getLocalPart();
    MtasParserType<MtasParserMapping<?>> parserType = typeEntry.getValue();
    // header line of this type
    overview.append("- ").append(localName).append(": ")
        .append(parserType.items.size()).append(" mapping(s)\n");
    // one line for each registered mapping
    for (int index = 0; index < parserType.items.size(); index++) {
      Object mapping = parserType.items.get(index);
      overview.append("\t").append(mapping).append("\n");
    }
  }
  return overview.toString();
}
/**
* Lists the given variable types: one line per type with its local name and
* number of variables, followed by one tab-indented line per variable.
*
* @param types the variable types to list, keyed by QName
* @return the listing
*/
private String printConfigVariableTypes(
    Map<QName, MtasParserType<MtasParserVariable>> types) {
  StringBuilder overview = new StringBuilder();
  for (Entry<QName, MtasParserType<MtasParserVariable>> typeEntry : types
      .entrySet()) {
    String localName = typeEntry.getKey().getLocalPart();
    MtasParserType<MtasParserVariable> parserType = typeEntry.getValue();
    // header line of this type
    overview.append("- ").append(localName).append(": ")
        .append(parserType.items.size()).append(" variables(s)\n");
    // one line for each registered variable
    for (int index = 0; index < parserType.items.size(); index++) {
      Object variable = parserType.items.get(index);
      overview.append("\t").append(variable).append("\n");
    }
  }
  return overview.toString();
}
/**
* Initialises the parser from the configuration.
*
* <p>
* The children of the configuration are processed in two passes. The first
* pass only looks for a "namespaceURI" element, because the namespace has to
* be known before any QName is created. The second pass registers the
* variables, references and mappings.
*
* @throws MtasConfigException if a mapping has an unknown type, or if
*         processing part of the configuration fails
* @see mtas.analysis.parser.MtasParser#initParser()
*/
@Override
protected void initParser() throws MtasConfigException {
  super.initParser();
  if (config == null) {
    return;
  }
  // first pass: find namespaceURI
  for (int i = 0; i < config.children.size(); i++) {
    MtasConfiguration current = config.children.get(i);
    if (current.name.equals("namespaceURI")) {
      namespaceURI = current.attributes.get("value");
    }
  }
  // second pass: variables, references and mappings
  for (int i = 0; i < config.children.size(); i++) {
    MtasConfiguration current = config.children.get(i);
    if (current.name.equals(XML_VARIABLES)) {
      initXmlVariables(current);
    } else if (current.name.equals(XML_REFERENCES)) {
      initXmlReferences(current);
    } else if (current.name.equals(XML_MAPPINGS)) {
      initXmlMappings(current);
    }
  }
}

/**
* Registers every "variable" child that has both a name and a value.
*
* @param variablesConfig the "variables" element of the configuration
* @throws MtasConfigException if processing a variable fails
*/
private void initXmlVariables(MtasConfiguration variablesConfig)
    throws MtasConfigException {
  for (int j = 0; j < variablesConfig.children.size(); j++) {
    MtasConfiguration variable = variablesConfig.children.get(j);
    if (!variable.name.equals(XML_VARIABLE)) {
      continue;
    }
    String nameVariable = variable.attributes.get(XML_VARIABLE_NAME);
    String valueVariable = variable.attributes.get(XML_VARIABLE_VALUE);
    if ((nameVariable == null) || (valueVariable == null)) {
      continue;
    }
    MtasParserVariable v = new MtasParserVariable(nameVariable,
        valueVariable);
    v.processConfig(variable);
    QName qn = getQName(nameVariable);
    MtasParserType<MtasParserVariable> t = variableTypes.get(qn);
    if (t != null) {
      t.addItem(v);
    } else {
      t = new MtasParserType<MtasParserVariable>(nameVariable, valueVariable,
          false);
      t.addItem(v);
      variableTypes.put(qn, t);
    }
  }
}

/**
* Registers every "reference" child that has both a name and a ref; a later
* reference with the same name replaces an earlier one.
*
* @param referencesConfig the "references" element of the configuration
* @throws MtasConfigException if creating a reference type fails
*/
private void initXmlReferences(MtasConfiguration referencesConfig)
    throws MtasConfigException {
  for (int j = 0; j < referencesConfig.children.size(); j++) {
    MtasConfiguration reference = referencesConfig.children.get(j);
    if (!reference.name.equals(XML_REFERENCE)) {
      continue;
    }
    String name = reference.attributes.get(XML_REFERENCE_NAME);
    String ref = reference.attributes.get(XML_REFERENCE_REF);
    if ((name != null) && (ref != null)) {
      MtasParserType<MtasParserMapping<?>> t = new MtasParserType<MtasParserMapping<?>>(
          MAPPING_TYPE_REF, name, false, ref);
      refTypes.put(getQName(t.getName()), t);
    }
  }
}

/**
* Registers every "mapping" child that has both a type and a name in the
* registry that belongs to its type.
*
* @param mappingsConfig the "mappings" element of the configuration
* @throws MtasConfigException if the type of a mapping is unknown, or if
*         processing a mapping fails
*/
private void initXmlMappings(MtasConfiguration mappingsConfig)
    throws MtasConfigException {
  for (int j = 0; j < mappingsConfig.children.size(); j++) {
    MtasConfiguration mapping = mappingsConfig.children.get(j);
    if (!mapping.name.equals(XML_MAPPING)) {
      continue;
    }
    String typeMapping = mapping.attributes.get(XML_MAPPING_TYPE);
    String nameMapping = mapping.attributes.get(XML_MAPPING_NAME);
    if ((typeMapping == null) || (nameMapping == null)) {
      continue;
    }
    if (typeMapping.equals(MAPPING_TYPE_RELATION)) {
      MtasXMLParserMappingRelation m = new MtasXMLParserMappingRelation();
      m.processConfig(mapping);
      registerXmlMapping(relationTypes, typeMapping, nameMapping, m);
    } else if (typeMapping.equals(MAPPING_TYPE_RELATION_ANNOTATION)) {
      MtasXMLParserMappingRelationAnnotation m = new MtasXMLParserMappingRelationAnnotation();
      m.processConfig(mapping);
      registerXmlMapping(relationAnnotationTypes, typeMapping, nameMapping,
          m);
    } else if (typeMapping.equals(MAPPING_TYPE_WORD)) {
      MtasXMLParserMappingWord m = new MtasXMLParserMappingWord();
      m.processConfig(mapping);
      registerXmlMapping(wordTypes, typeMapping, nameMapping, m);
    } else if (typeMapping.equals(MAPPING_TYPE_WORD_ANNOTATION)) {
      MtasXMLParserMappingWordAnnotation m = new MtasXMLParserMappingWordAnnotation();
      m.processConfig(mapping);
      registerXmlMapping(wordAnnotationTypes, typeMapping, nameMapping, m);
    } else if (typeMapping.equals(MAPPING_TYPE_GROUP)) {
      MtasXMLParserMappingGroup m = new MtasXMLParserMappingGroup();
      m.processConfig(mapping);
      registerXmlMapping(groupTypes, typeMapping, nameMapping, m);
    } else if (typeMapping.equals(MAPPING_TYPE_GROUP_ANNOTATION)) {
      MtasXMLParserMappingGroupAnnotation m = new MtasXMLParserMappingGroupAnnotation();
      m.processConfig(mapping);
      registerXmlMapping(groupAnnotationTypes, typeMapping, nameMapping, m);
    } else {
      throw new MtasConfigException("unknown mapping type " + typeMapping);
    }
  }
}

/**
* Adds a mapping to the type registered under the given name, creating and
* registering that type first when it does not exist yet.
*
* @param types the registry for this kind of mapping
* @param typeMapping the type of the mapping
* @param nameMapping the name of the mapping
* @param m the mapping, already configured
* @throws MtasConfigException if adding the mapping fails
*/
private void registerXmlMapping(
    Map<QName, MtasParserType<MtasParserMapping<?>>> types,
    String typeMapping, String nameMapping, MtasParserMapping<?> m)
    throws MtasConfigException {
  QName qn = getQName(nameMapping);
  MtasParserType<MtasParserMapping<?>> t = types.get(qn);
  if (t != null) {
    t.addItem(m);
  } else {
    t = new MtasParserType<MtasParserMapping<?>>(typeMapping, nameMapping,
        false);
    t.addItem(m);
    types.put(qn, t);
  }
}
/**
* Parses the XML document provided by the reader and builds the token
* collection for it.
*
* <p>
* The document is read event by event with a StAX stream reader. When a root
* tag is configured, the first element has to be that root tag. Content is
* parsed from the start when no content tag is configured, otherwise from
* the moment the content tag is opened. Within the content, each start
* element is matched against the configured relation, relation annotation,
* group, group annotation, word, word annotation and reference types, taking
* into account which kinds of objects are currently open. Elements that do
* not match, or that fail prevalidation, are only counted as unknown
* ancestors. On the matching end element the object is removed from the
* list of open objects, its positions and offsets are registered under its
* id, and its mappings are computed.
*
* <p>
* After the document has been read, the tokens registered in the update
* lists are completed with variable values, offsets and positions, and the
* collection is checked.
*
* @param reader the reader providing the XML document
* @return the token collection
* @throws MtasParserException if the declared encoding is not UTF-8, the
*         configured root tag is missing, an element outside the content is
*         found while this is not allowed, or the XML is not valid
* @throws MtasConfigException declared by this method; not thrown directly
*         here, presumably raised by one of the helper methods called -
*         TODO confirm
* @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
*/
@Override
public MtasTokenCollection createTokenCollection(Reader reader)
throws MtasParserException, MtasConfigException {
// without a root tag or content tag there is nothing to wait for
Boolean hasRoot = rootTag == null ? true : false;
Boolean parsingContent = contentTag == null ? true : false;
String textContent = null;
// number of currently open elements that were not accepted as object
Integer unknownAncestors = 0;
Integer lastOffset = 0;
AtomicInteger position = new AtomicInteger(0);
// positions and offsets of finished objects, by id
Map<String, Set<Integer>> idPositions = new HashMap<>();
Map<String, Integer[]> idOffsets = new HashMap<>();
Map<String, Map<Integer, Set<String>>> updateList = createUpdateList();
// currently open objects, by mapping type
Map<String, List<MtasParserObject>> currentList = createCurrentList();
Map<String, Map<String, String>> variables = createVariables();
tokenCollection = new MtasTokenCollection();
MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
// NOTE(review): the factory is used with its default settings; nothing here
// restricts DTDs or external entities - confirm whether the documents that
// are parsed can come from an untrusted source
XMLInputFactory factory = XMLInputFactory.newInstance();
try {
XMLStreamReader streamReader = factory.createXMLStreamReader(reader);
QName qname;
try {
int event = streamReader.getEventType();
MtasParserType<?> currentType;
MtasParserType<?> tmpCurrentType;
MtasParserType<?> tmpVariableType;
MtasParserObject currentObject = null;
MtasParserObject variableObject = null;
// handle the current event, then move on to the next one until the
// stream is exhausted
while (true) {
switch (event) {
case XMLStreamConstants.START_DOCUMENT:
log.debug("start of document");
String encodingScheme = streamReader.getCharacterEncodingScheme();
if (encodingScheme == null) {
//ignore for now
log.info("No encodingScheme found, assume utf-8");
//throw new MtasParserException("No encodingScheme found");
} else if (!encodingScheme.equalsIgnoreCase("utf-8")) {
throw new MtasParserException(
"XML not UTF-8 encoded but '" + encodingScheme + "'");
}
break;
case XMLStreamConstants.END_DOCUMENT:
log.debug("end of document");
break;
case XMLStreamConstants.SPACE:
// remember the offset reported by the stream reader for this event
lastOffset = streamReader.getLocation().getCharacterOffset();
break;
case XMLStreamConstants.START_ELEMENT:
// get data
qname = streamReader.getName();
// check for rootTag
if (!hasRoot) {
if (qname.equals(getQName(rootTag))) {
hasRoot = true;
} else {
throw new MtasParserException("No " + rootTag);
}
// parse content
} else {
// variables are collected for every element after the root, also
// outside the content
if ((tmpVariableType = variableTypes.get(qname)) != null) {
variableObject = new MtasParserObject(tmpVariableType);
collectAttributes(variableObject, streamReader);
computeVariablesFromObject(variableObject, currentList,
variables);
}
if (parsingContent) {
// check for relation : not within word, not within
// groupAnnotation
if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
&& (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.isEmpty())
&& (tmpCurrentType = relationTypes.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
currentList.get(MAPPING_TYPE_RELATION).add(currentObject);
unknownAncestors = 0;
}
// check for relation annotation: not within word, but within
// relation
} else if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
&& (!currentList.get(MAPPING_TYPE_RELATION).isEmpty())
&& (tmpCurrentType = relationAnnotationTypes
.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
currentList.get(MAPPING_TYPE_RELATION_ANNOTATION)
.add(currentObject);
unknownAncestors = 0;
}
// check for group: not within word, not within relation, not
// within groupAnnotation
} else if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
&& (currentList.get(MAPPING_TYPE_RELATION).isEmpty())
&& (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.isEmpty())
&& (tmpCurrentType = groupTypes.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
unknownAncestors = 0;
}
// check for group annotation: not within word, not within
// relation, but within group
} else if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
&& (currentList.get(MAPPING_TYPE_RELATION).isEmpty())
&& (!currentList.get(MAPPING_TYPE_GROUP).isEmpty())
&& (tmpCurrentType = groupAnnotationTypes
.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.add(currentObject);
unknownAncestors = 0;
}
// check for word: not within relation, not within
// groupAnnotation, not within word, not within wordAnnotation
} else if ((currentList.get(MAPPING_TYPE_RELATION).isEmpty())
&& (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.isEmpty())
&& (currentList.get(MAPPING_TYPE_WORD).isEmpty())
&& (currentList.get(MAPPING_TYPE_WORD_ANNOTATION).isEmpty())
&& (tmpCurrentType = wordTypes.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setOffsetStart(lastOffset);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
// an accepted word takes the next position
currentObject.addPosition(position.getAndIncrement());
currentList.get(MAPPING_TYPE_WORD).add(currentObject);
unknownAncestors = 0;
}
// check for word annotation: not within relation, not within
// groupAnnotation, but within word
} else if ((currentList.get(MAPPING_TYPE_RELATION).isEmpty())
&& (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.isEmpty())
&& (!currentList.get(MAPPING_TYPE_WORD).isEmpty())
&& (tmpCurrentType = wordAnnotationTypes
.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
// a word annotation gets the positions of the innermost open word
currentObject.addPositions(currentList.get(MAPPING_TYPE_WORD)
.get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
.getPositions());
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
.add(currentObject);
unknownAncestors = 0;
}
// check for references: within relation
} else if (!currentList.get(MAPPING_TYPE_RELATION).isEmpty()
&& (tmpCurrentType = refTypes.get(qname)) != null) {
currentObject = new MtasParserObject(tmpCurrentType);
collectAttributes(currentObject, streamReader);
currentObject.setUnknownAncestorNumber(unknownAncestors);
currentObject.setRealOffsetStart(lastOffset);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors++;
} else {
currentType = tmpCurrentType;
currentList.get(MAPPING_TYPE_REF).add(currentObject);
unknownAncestors = 0;
// add reference to ancestor relations
for (MtasParserObject currentRelation : currentList
.get(MAPPING_TYPE_RELATION)) {
currentRelation.addRefId(currentObject
.getAttribute(currentType.getRefAttributeName()));
// register mapping for relation (for recursive relations)
SortedSet<String> keyMapList;
if (currentRelation.getId() != null) {
if (relationKeyMap
.containsKey(currentRelation.getId())) {
keyMapList = relationKeyMap
.get(currentRelation.getId());
} else {
keyMapList = new TreeSet<>();
relationKeyMap.put(currentRelation.getId(),
keyMapList);
}
keyMapList.add(currentObject
.getAttribute(currentType.getRefAttributeName()));
}
}
}
} else {
unknownAncestors++;
}
// check for start content
} else if (qname.equals(getQName(contentTag))) {
parsingContent = true;
// unexpected
} else if (!allowNonContent) {
throw new MtasParserException(
"Unexpected " + qname.getLocalPart() + " in document");
}
}
// set offset (end of start-element)
lastOffset = streamReader.getLocation().getCharacterOffset();
break;
case XMLStreamConstants.END_ELEMENT:
// set offset (end of end-element)
lastOffset = streamReader.getLocation().getCharacterOffset();
// get data
qname = streamReader.getName();
// parse content
if (parsingContent) {
// the end element belongs to an element that was not accepted
if (unknownAncestors > 0) {
unknownAncestors--;
// check for reference: because otherwise currentList should
// contain no references
} else if (!currentList.get(MAPPING_TYPE_REF).isEmpty()) {
if ((currentType = refTypes.get(qname)) != null) {
currentObject = currentList.get(MAPPING_TYPE_REF)
.remove(currentList.get(MAPPING_TYPE_REF).size() - 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
// ignore text and realOffset: not relevant
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
// restore the counter to the value stored when the object was opened
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// this shouldn't happen
}
// check for wordAnnotation: because otherwise currentList
// should contain no wordAnnotations
} else if (!currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
.isEmpty()) {
if ((currentType = wordAnnotationTypes.get(qname)) != null) {
currentObject = currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
.remove(
currentList.get(MAPPING_TYPE_WORD_ANNOTATION).size()
- 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
currentObject.setRealOffsetEnd(lastOffset);
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
// offset always null, so update later with word (should be
// possible)
if ((currentObject.getId() != null)
&& (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
currentList.get(MAPPING_TYPE_WORD)
.get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
.addUpdateableIdWithOffset(currentObject.getId());
}
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// this shouldn't happen
}
// check for word: because otherwise currentList should contain
// no words
} else if (!currentList.get(MAPPING_TYPE_WORD).isEmpty()) {
if ((currentType = wordTypes.get(qname)) != null) {
currentObject = currentList.get(MAPPING_TYPE_WORD)
.remove(currentList.get(MAPPING_TYPE_WORD).size() - 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
currentObject.setOffsetEnd(lastOffset);
currentObject.setRealOffsetEnd(lastOffset);
// update ancestor groups with position and offset
for (MtasParserObject currentGroup : currentList
.get(MAPPING_TYPE_GROUP)) {
currentGroup.addPositions(currentObject.getPositions());
currentGroup.addOffsetStart(currentObject.getOffsetStart());
currentGroup.addOffsetEnd(currentObject.getOffsetEnd());
}
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// this shouldn't happen
}
// check for group annotation: because otherwise currentList
// should contain no groupAnnotations
} else if (!currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.isEmpty()) {
if ((currentType = groupAnnotationTypes.get(qname)) != null) {
currentObject = currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.remove(
currentList.get(MAPPING_TYPE_GROUP_ANNOTATION).size()
- 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
currentObject.setRealOffsetEnd(lastOffset);
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// this shouldn't happen
}
// check for relation annotation
} else if (!currentList.get(MAPPING_TYPE_RELATION_ANNOTATION)
.isEmpty()) {
if ((currentType = relationAnnotationTypes
.get(qname)) != null) {
currentObject = currentList
.get(MAPPING_TYPE_RELATION_ANNOTATION).remove(currentList
.get(MAPPING_TYPE_RELATION_ANNOTATION).size() - 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
currentObject.setRealOffsetEnd(lastOffset);
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// this shouldn't happen
}
// check for relation
} else if (!currentList.get(MAPPING_TYPE_RELATION).isEmpty()) {
if ((currentType = relationTypes.get(qname)) != null) {
currentObject = currentList.get(MAPPING_TYPE_RELATION).remove(
currentList.get(MAPPING_TYPE_RELATION).size() - 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
// ignore text: should not occur
currentObject.setRealOffsetEnd(lastOffset);
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// this shouldn't happen
}
// check for group
} else if (!currentList.get(MAPPING_TYPE_GROUP).isEmpty()) {
if ((currentType = groupTypes.get(qname)) != null) {
currentObject = currentList.get(MAPPING_TYPE_GROUP)
.remove(currentList.get(MAPPING_TYPE_GROUP).size() - 1);
assert currentObject.getType()
.equals(currentType) : "object expected to be "
+ currentObject.getType().getName() + ", not "
+ currentType.getName();
assert unknownAncestors == 0 : "error in administration "
+ currentObject.getType().getName();
// ignore text: should not occur
currentObject.setRealOffsetEnd(lastOffset);
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(),
currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
// NOTE(review): unlike the other kinds, a non-matching end element is
// counted down here, although the counter is not positive at this
// point - confirm this is intended
unknownAncestors--;
}
// NOTE(review): the end of the content is recognised by the literal
// name "text", while the start of the content is recognised by
// contentTag - confirm this is intended for parsers that use another
// content tag
} else if (qname.equals(getQName("text"))) {
parsingContent = false;
assert unknownAncestors == 0 : "error in administration unknownAncestors";
assert currentList.get(MAPPING_TYPE_REF)
.isEmpty() : "error in administration references";
assert currentList.get(MAPPING_TYPE_GROUP)
.isEmpty() : "error in administration groups";
assert currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
.isEmpty() : "error in administration groupAnnotations";
assert currentList.get(MAPPING_TYPE_WORD)
.isEmpty() : "error in administration words";
assert currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
.isEmpty() : "error in administration wordAnnotations";
assert currentList.get(MAPPING_TYPE_RELATION)
.isEmpty() : "error in administration relations";
assert currentList.get(MAPPING_TYPE_RELATION_ANNOTATION)
.isEmpty() : "error in administration relationAnnotations";
}
}
// forget text
textContent = null;
break;
case XMLStreamConstants.CHARACTERS:
// remember the offset reported by the stream reader for this event
lastOffset = streamReader.getLocation().getCharacterOffset();
// check for text
if (streamReader.hasText()) {
textContent = streamReader.getText();
}
// text is only added to the last created object, and only when no
// unaccepted element is open
if (currentObject != null && unknownAncestors.equals(0)) {
currentObject.addText(textContent);
}
break;
default:
break;
}
if (!streamReader.hasNext()) {
break;
}
event = streamReader.next();
}
} finally {
streamReader.close();
}
// final checks
assert unknownAncestors == 0 : "error in administration unknownAncestors";
assert hasRoot : "no " + rootTag;
} catch (XMLStreamException e) {
// only the message of the stream exception is passed on to the caller
log.debug(e);
throw new MtasParserException("No valid XML: " + e.getMessage());
}
// update tokens with variable
for (Entry<Integer, Set<String>> updateItem : updateList
.get(UPDATE_TYPE_VARIABLE).entrySet()) {
MtasToken token = tokenCollection.get(updateItem.getKey());
String encodedPrefix = token.getPrefix();
String encodedPostfix = token.getPostfix();
token.setValue(decodeAndUpdateWithVariables(encodedPrefix, encodedPostfix,
variables));
}
// update tokens with offset
for (Entry<Integer, Set<String>> updateItem : updateList
.get(UPDATE_TYPE_OFFSET).entrySet()) {
// ids to use: the referenced ids that are known, extended with the ids
// collected through the relation key map
Set<String> refIdList = new HashSet<>();
for (String refId : updateItem.getValue()) {
if (idPositions.containsKey(refId)) {
refIdList.add(refId);
}
if (relationKeyMap.containsKey(refId)) {
refIdList.addAll(recursiveCollect(refId, relationKeyMap, 10));
}
}
for (String refId : refIdList) {
Integer[] refOffset = idOffsets.get(refId);
Integer tokenId = updateItem.getKey();
if (tokenId != null && refOffset != null) {
MtasToken token = tokenCollection.get(tokenId);
token.addOffset(refOffset[0], refOffset[1]);
}
}
}
// update tokens with position
for (Entry<Integer, Set<String>> updateItem : updateList
.get(UPDATE_TYPE_POSITION).entrySet()) {
// same collection of ids as for the offsets
HashSet<String> refIdList = new HashSet<>();
for (String refId : updateItem.getValue()) {
if (idPositions.containsKey(refId)) {
refIdList.add(refId);
}
if (relationKeyMap.containsKey(refId)) {
refIdList.addAll(recursiveCollect(refId, relationKeyMap, 10));
}
}
for (String refId : refIdList) {
Set<Integer> refPositions = idPositions.get(refId);
Integer tokenId = updateItem.getKey();
if (tokenId != null && refPositions != null) {
MtasToken token = tokenCollection.get(tokenId);
token.addPositions(refPositions);
}
}
}
// final check
tokenCollection.check(autorepair, makeunique);
return tokenCollection;
}
/**
* Collects every ref id that can be reached from the given ref id by
* following the entries of the relation key map for at most
* {@code maxRecursion} steps. The starting id itself is only part of the
* result when it is reached again through the map.
*
* <p>
* The map is traversed level by level and every id is expanded only once,
* so the amount of work stays proportional to the size of the map, also
* when relations refer to each other in a cycle.
*
* @param refId the ref id to start from
* @param relationKeyMap the relation key map
* @param maxRecursion the maximum number of steps to follow; nothing is
*        collected when this is not positive
* @return the collected ref ids
*/
private Collection<? extends String> recursiveCollect(String refId,
    Map<String, SortedSet<String>> relationKeyMap, int maxRecursion) {
  Set<String> collected = new HashSet<>();
  // ids found in the previous step that still have to be expanded
  Set<String> frontier = new HashSet<>();
  frontier.add(refId);
  for (int depth = 0; depth < maxRecursion
      && !frontier.isEmpty(); depth++) {
    Set<String> next = new HashSet<>();
    for (String current : frontier) {
      SortedSet<String> subList = relationKeyMap.get(current);
      if (subList == null) {
        continue;
      }
      for (String subRefId : subList) {
        // an id that was collected before has already been expanded, or is
        // about to be expanded in this step
        if (collected.add(subRefId)) {
          next.add(subRefId);
        }
      }
    }
    frontier = next;
  }
  return collected;
}
/**
* Returns the QName for the given local name within the namespace of this
* parser. A QName is created only once for each key and reused afterwards.
*
* @param key the local name
* @return the QName for this key
*/
private QName getQName(String key) {
  QName cached = qNames.get(key);
  if (cached != null) {
    return cached;
  }
  // not seen before: create and remember
  QName created = new QName(namespaceURI, key);
  qNames.put(key, created);
  return created;
}
/**
* Copies the id and the attributes of the current element of the stream
* reader into the given object.
*
* <p>
* The attributes collected earlier in {@code objectAttributes} are removed
* first. An attribute without a namespace of its own is treated as belonging
* to the namespace of its element. Attributes are stored in
* {@code objectAttributes} when no namespace is configured for this parser
* or when their namespace equals the configured one; all other attributes
* are stored in {@code objectOtherAttributes}, grouped by namespace.
*
* @param currentObject the object to store the id and attributes in
* @param streamReader the stream reader, positioned on a start element
*/
public void collectAttributes(MtasParserObject currentObject,
    XMLStreamReader streamReader) {
  currentObject.objectAttributes.clear();
  currentObject.objectId = streamReader.getAttributeValue(namespaceURI_id,
      "id");
  for (int i = 0; i < streamReader.getAttributeCount(); i++) {
    String attributeNamespaceURI = streamReader.getAttributeNamespace(i);
    if (attributeNamespaceURI == null || attributeNamespaceURI.equals("")) {
      attributeNamespaceURI = streamReader.getNamespaceURI();
    }
    // the element may have no namespace either: use the empty namespace
    // instead of null, so the comparison below cannot fail
    if (attributeNamespaceURI == null) {
      attributeNamespaceURI = "";
    }
    String attributeName = streamReader.getAttributeLocalName(i);
    String attributeValue = streamReader.getAttributeValue(i);
    if (namespaceURI == null || attributeNamespaceURI.equals(namespaceURI)) {
      currentObject.objectAttributes.put(attributeName, attributeValue);
    } else {
      HashMap<String, String> otherMap = currentObject.objectOtherAttributes
          .get(attributeNamespaceURI);
      if (otherMap == null) {
        otherMap = new HashMap<>();
        currentObject.objectOtherAttributes.put(attributeNamespaceURI,
            otherMap);
      }
      otherMap.put(attributeName, attributeValue);
    }
  }
}
/**
 * Mapping of type {@code MAPPING_TYPE_RELATION}: position and offset are
 * initialised to {@code SOURCE_REFS}, the real offset to {@code SOURCE_OWN}.
 */
private class MtasXMLParserMappingRelation
    extends MtasParserMapping<MtasXMLParserMappingRelation> {

  /**
   * Creates the mapping and sets its type, offset, real offset and position
   * to the defaults for relations.
   */
  public MtasXMLParserMappingRelation() {
    type = MAPPING_TYPE_RELATION;
    offset = SOURCE_REFS;
    realOffset = SOURCE_OWN;
    position = SOURCE_REFS;
  }

  /**
   * Returns this instance.
   *
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
   */
  @Override
  protected MtasXMLParserMappingRelation self() {
    return this;
  }
}
/**
 * Mapping of type {@code MAPPING_TYPE_RELATION_ANNOTATION}: position and
 * offset are initialised to {@code SOURCE_ANCESTOR_RELATION}, the real offset
 * to {@code SOURCE_OWN}.
 */
private class MtasXMLParserMappingRelationAnnotation
    extends MtasParserMapping<MtasXMLParserMappingRelationAnnotation> {

  /**
   * Creates the mapping and sets its type, offset, real offset and position
   * to the defaults for relation annotations.
   */
  public MtasXMLParserMappingRelationAnnotation() {
    type = MAPPING_TYPE_RELATION_ANNOTATION;
    offset = SOURCE_ANCESTOR_RELATION;
    realOffset = SOURCE_OWN;
    position = SOURCE_ANCESTOR_RELATION;
  }

  /**
   * Returns this instance.
   *
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
   */
  @Override
  protected MtasXMLParserMappingRelationAnnotation self() {
    return this;
  }
}
/**
 * Mapping of type {@code MAPPING_TYPE_GROUP}: position, offset and real
 * offset are all initialised to {@code SOURCE_OWN}.
 */
private class MtasXMLParserMappingGroup
    extends MtasParserMapping<MtasXMLParserMappingGroup> {

  /**
   * Creates the mapping and sets its type, offset, real offset and position
   * to the defaults for groups.
   */
  public MtasXMLParserMappingGroup() {
    type = MAPPING_TYPE_GROUP;
    offset = SOURCE_OWN;
    realOffset = SOURCE_OWN;
    position = SOURCE_OWN;
  }

  /**
   * Returns this instance.
   *
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
   */
  @Override
  protected MtasXMLParserMappingGroup self() {
    return this;
  }
}
/**
 * Mapping of type {@code MAPPING_TYPE_GROUP_ANNOTATION}: position and offset
 * are initialised to {@code SOURCE_ANCESTOR_GROUP}, the real offset to
 * {@code SOURCE_OWN}. Setting both a start and an end switches position and
 * offset to {@code SOURCE_REFS}.
 */
private class MtasXMLParserMappingGroupAnnotation
    extends MtasParserMapping<MtasXMLParserMappingGroupAnnotation> {

  /**
   * Creates the mapping and sets its type, offset, real offset and position
   * to the defaults for group annotations.
   */
  public MtasXMLParserMappingGroupAnnotation() {
    type = MAPPING_TYPE_GROUP_ANNOTATION;
    offset = SOURCE_ANCESTOR_GROUP;
    realOffset = SOURCE_OWN;
    position = SOURCE_ANCESTOR_GROUP;
  }

  /**
   * Returns this instance.
   *
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
   */
  @Override
  protected MtasXMLParserMappingGroupAnnotation self() {
    return this;
  }

  /**
   * Passes start and end on to the superclass; when both are given, position
   * and offset are switched to {@code SOURCE_REFS}.
   *
   * @param start the start value, may be {@code null}
   * @param end the end value, may be {@code null}
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#setStartEnd(java.lang.String,
   *      java.lang.String)
   */
  @Override
  protected void setStartEnd(String start, String end) {
    super.setStartEnd(start, end);
    if (start == null || end == null) {
      // incomplete range: keep the current position and offset sources
      return;
    }
    offset = SOURCE_REFS;
    position = SOURCE_REFS;
  }
}
/**
 * Mapping of type {@code MAPPING_TYPE_WORD}: position, offset and real offset
 * are all initialised to {@code SOURCE_OWN}.
 */
private class MtasXMLParserMappingWord
    extends MtasParserMapping<MtasXMLParserMappingWord> {

  /**
   * Creates the mapping and sets its type, offset, real offset and position
   * to the defaults for words.
   */
  public MtasXMLParserMappingWord() {
    type = MAPPING_TYPE_WORD;
    offset = SOURCE_OWN;
    realOffset = SOURCE_OWN;
    position = SOURCE_OWN;
  }

  /**
   * Returns this instance.
   *
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
   */
  @Override
  protected MtasXMLParserMappingWord self() {
    return this;
  }
}
/**
 * Mapping of type {@code MAPPING_TYPE_WORD_ANNOTATION}: position and real
 * offset are initialised to {@code SOURCE_OWN}, the offset to
 * {@code SOURCE_ANCESTOR_WORD}.
 */
private class MtasXMLParserMappingWordAnnotation
    extends MtasParserMapping<MtasXMLParserMappingWordAnnotation> {

  /**
   * Creates the mapping and sets its type, offset, real offset and position
   * to the defaults for word annotations.
   */
  public MtasXMLParserMappingWordAnnotation() {
    type = MAPPING_TYPE_WORD_ANNOTATION;
    offset = SOURCE_ANCESTOR_WORD;
    realOffset = SOURCE_OWN;
    position = SOURCE_OWN;
  }

  /**
   * Returns this instance.
   *
   * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
   */
  @Override
  protected MtasXMLParserMappingWordAnnotation self() {
    return this;
  }
}
}