MtasXMLParser.java
- package mtas.analysis.parser;
- import java.io.Reader;
- import java.util.Collection;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Map;
- import java.util.Map.Entry;
- import java.util.Set;
- import java.util.SortedSet;
- import java.util.TreeSet;
- import java.util.concurrent.atomic.AtomicInteger;
- import javax.xml.namespace.QName;
- import javax.xml.stream.XMLInputFactory;
- import javax.xml.stream.XMLStreamConstants;
- import javax.xml.stream.XMLStreamException;
- import javax.xml.stream.XMLStreamReader;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import mtas.analysis.token.MtasToken;
- import mtas.analysis.token.MtasTokenCollection;
- import mtas.analysis.token.MtasTokenIdFactory;
- import mtas.analysis.util.MtasConfigException;
- import mtas.analysis.util.MtasParserException;
- import mtas.analysis.util.MtasConfiguration;
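- /**
- * Abstract base class for Mtas parsers that read XML through a StAX
- * {@link XMLStreamReader} and build an {@link MtasTokenCollection} from the
- * elements named in the parser configuration.
- */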
- abstract class MtasXMLParser extends MtasBasicParser {
- /** Logger for this class. */
- private static final Log log = LogFactory.getLog(MtasXMLParser.class);
- /** Namespace URI used by getQName for every element QName; initParser reads it from the "value" attribute of a "namespaceURI" config child. */
- protected String namespaceURI = null;
- /** Namespace URI passed to getAttributeValue when collectAttributes looks up the "id" attribute. */
- protected String namespaceURI_id = null;
- /** Local name the first element must have; when null, createTokenCollection does no root check. */
- protected String rootTag = null;
- /** Local name of the element that switches content parsing on; when null, content parsing is on from the start. */
- protected String contentTag = null;
- /** When false, a start element outside content that is not the content tag makes createTokenCollection throw. */
- protected boolean allowNonContent = false;
- /** Relation id mapped to the sorted reference values collected from reference elements inside that relation. */
- private Map<String, SortedSet<String>> relationKeyMap = new HashMap<>();
- /** Cache of QName instances keyed by local name, filled by getQName. */
- private Map<String, QName> qNames = new HashMap<>();
- /** Configured relation mappings, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> relationTypes = new HashMap<>();
- /** Configured relation annotation mappings, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> relationAnnotationTypes = new HashMap<>();
- /** Reference types from the "references" config section, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> refTypes = new HashMap<>();
- /** Configured group mappings, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> groupTypes = new HashMap<>();
- /** Configured group annotation mappings, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> groupAnnotationTypes = new HashMap<>();
- /** Configured word mappings, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> wordTypes = new HashMap<>();
- /** Configured word annotation mappings, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserMapping<?>>> wordAnnotationTypes = new HashMap<>();
- /** Configured variables from the "variables" config section, keyed by element QName. */
- private Map<QName, MtasParserType<MtasParserVariable>> variableTypes = new HashMap<>();
- /** Name of the config element whose children are scanned for variable definitions. */
- private static final String XML_VARIABLES = "variables";
- /** Name of a config element that defines one variable. */
- private static final String XML_VARIABLE = "variable";
- /** Attribute of a variable element that holds its name. */
- private static final String XML_VARIABLE_NAME = "name";
- /** Attribute of a variable element that holds its value. */
- private static final String XML_VARIABLE_VALUE = "value";
- /** Name of the config element whose children are scanned for reference definitions. */
- private static final String XML_REFERENCES = "references";
- /** Name of a config element that defines one reference. */
- private static final String XML_REFERENCE = "reference";
- /** Attribute of a reference element that holds its name. */
- private static final String XML_REFERENCE_NAME = "name";
- /** Attribute of a reference element whose value is passed as the last constructor argument of the reference type. */
- private static final String XML_REFERENCE_REF = "ref";
- /** Name of the config element whose children are scanned for mapping definitions. */
- private static final String XML_MAPPINGS = "mappings";
- /** Name of a config element that defines one mapping. */
- private static final String XML_MAPPING = "mapping";
- /** Attribute of a mapping element that selects the mapping type. */
- private static final String XML_MAPPING_TYPE = "type";
- /** Attribute of a mapping element that holds the element name it applies to. */
- private static final String XML_MAPPING_NAME = "name";
- /**
-  * Instantiates a new mtas XML parser and initialises it from the
-  * configuration.
-  * <p>
-  * A {@link MtasConfigException} raised by {@link #initParser()} is not
-  * propagated: it is logged and construction completes normally.
-  *
-  * @param config the parser configuration, handed on to the superclass
-  */
- public MtasXMLParser(MtasConfiguration config) {
-   super(config);
-   try {
-     initParser();
-   } catch (MtasConfigException e) {
-     // hand the exception over as throwable as well, so the logging backend
-     // records the stack trace and not only the exception's toString()
-     log.error(e.toString(), e);
-   }
- }
- /**
-  * Builds a human-readable overview of the parsed configuration.
-  * <p>
-  * For each category (variable, group, groupAnnotation, word, wordAnnotation,
-  * relation, relationAnnotation, references) one header line states the number
-  * of registered types, followed by the detail lines for those types. Every
-  * header line ends with a newline, so detail lines always start on their own
-  * line.
-  *
-  * @return the overview, enclosed by two "=== CONFIGURATION ===" lines
-  * @see mtas.analysis.parser.MtasParser#printConfig()
-  */
- @Override
- public String printConfig() {
-   StringBuilder text = new StringBuilder();
-   text.append("=== CONFIGURATION ===\n");
-   text.append("type: ").append(variableTypes.size()).append(" x variable\n");
-   text.append(printConfigVariableTypes(variableTypes));
-   text.append("type: ").append(groupTypes.size()).append(" x group\n");
-   text.append(printConfigMappingTypes(groupTypes));
-   // the newline was missing here, gluing the first detail line to the header
-   text.append("type: ").append(groupAnnotationTypes.size())
-       .append(" x groupAnnotation\n");
-   text.append(printConfigMappingTypes(groupAnnotationTypes));
-   text.append("type: ").append(wordTypes.size()).append(" x word\n");
-   text.append(printConfigMappingTypes(wordTypes));
-   // same missing newline as for groupAnnotation
-   text.append("type: ").append(wordAnnotationTypes.size())
-       .append(" x wordAnnotation\n");
-   text.append(printConfigMappingTypes(wordAnnotationTypes));
-   text.append("type: ").append(relationTypes.size()).append(" x relation\n");
-   text.append(printConfigMappingTypes(relationTypes));
-   text.append("type: ").append(relationAnnotationTypes.size())
-       .append(" x relationAnnotation\n");
-   text.append(printConfigMappingTypes(relationAnnotationTypes));
-   text.append("type: ").append(refTypes.size()).append(" x references\n");
-   text.append(printConfigMappingTypes(refTypes));
-   text.append("=== CONFIGURATION ===\n");
-   return text.toString();
- }
- /**
-  * Renders the given mapping types as text: one summary line per type with
-  * the local part of its element name and its number of mappings, followed by
-  * one tab-indented line per mapping.
-  *
-  * @param types the mapping types, keyed by element QName
-  * @return the rendered text; empty when the map has no entries
-  */
- private String printConfigMappingTypes(
-     Map<QName, MtasParserType<MtasParserMapping<?>>> types) {
-   StringBuilder out = new StringBuilder();
-   for (Entry<QName, MtasParserType<MtasParserMapping<?>>> typeEntry : types
-       .entrySet()) {
-     MtasParserType<MtasParserMapping<?>> parserType = typeEntry.getValue();
-     out.append("- ").append(typeEntry.getKey().getLocalPart()).append(": ")
-         .append(parserType.items.size()).append(" mapping(s)\n");
-     for (int index = 0; index < parserType.items.size(); index++) {
-       out.append("\t").append(parserType.items.get(index)).append("\n");
-     }
-   }
-   return out.toString();
- }
- /**
-  * Renders the given variable types as text: one summary line per type with
-  * the local part of its element name and its number of variables, followed
-  * by one tab-indented line per variable.
-  *
-  * @param types the variable types, keyed by element QName
-  * @return the rendered text; empty when the map has no entries
-  */
- private String printConfigVariableTypes(
-     Map<QName, MtasParserType<MtasParserVariable>> types) {
-   StringBuilder text = new StringBuilder();
-   for (Entry<QName, MtasParserType<MtasParserVariable>> entry : types
-       .entrySet()) {
-     MtasParserType<MtasParserVariable> type = entry.getValue();
-     // label reads "variable(s)", matching "mapping(s)" in the sibling method
-     text.append("- ").append(entry.getKey().getLocalPart()).append(": ")
-         .append(type.items.size()).append(" variable(s)\n");
-     for (int i = 0; i < type.items.size(); i++) {
-       text.append("\t").append(type.items.get(i)).append("\n");
-     }
-   }
-   return text.toString();
- }
- /**
-  * Reads the parser configuration and registers every declared variable,
-  * reference and mapping under the QName of the element name it is declared
-  * for.
-  * <p>
-  * The configuration children are walked twice: the first pass only resolves
-  * the namespace URI, because {@link #getQName(String)} uses it for every
-  * QName created during the second pass. Variables, references and mappings
-  * that lack one of their two required attributes are skipped silently.
-  *
-  * @throws MtasConfigException if a mapping declares an unknown type; methods
-  *         called from here that are declared elsewhere may raise it as well
-  * @see mtas.analysis.parser.MtasParser#initParser()
-  */
- @Override
- protected void initParser() throws MtasConfigException {
-   super.initParser();
-   if (config == null) {
-     return;
-   }
-   // first pass: find namespaceURI (the last declaration wins)
-   for (int i = 0; i < config.children.size(); i++) {
-     MtasConfiguration current = config.children.get(i);
-     if (current.name.equals("namespaceURI")) {
-       namespaceURI = current.attributes.get("value");
-     }
-   }
-   // second pass: variables, references and mappings
-   for (int i = 0; i < config.children.size(); i++) {
-     MtasConfiguration current = config.children.get(i);
-     if (current.name.equals(XML_VARIABLES)) {
-       for (int j = 0; j < current.children.size(); j++) {
-         MtasConfiguration variable = current.children.get(j);
-         if (!variable.name.equals(XML_VARIABLE)) {
-           continue;
-         }
-         String nameVariable = variable.attributes.get(XML_VARIABLE_NAME);
-         String valueVariable = variable.attributes.get(XML_VARIABLE_VALUE);
-         if ((nameVariable == null) || (valueVariable == null)) {
-           continue;
-         }
-         MtasParserVariable v = new MtasParserVariable(nameVariable,
-             valueVariable);
-         v.processConfig(variable);
-         QName qn = getQName(nameVariable);
-         if (variableTypes.containsKey(qn)) {
-           variableTypes.get(qn).addItem(v);
-         } else {
-           MtasParserType<MtasParserVariable> t = new MtasParserType<>(
-               nameVariable, valueVariable, false);
-           t.addItem(v);
-           variableTypes.put(qn, t);
-         }
-       }
-     } else if (current.name.equals(XML_REFERENCES)) {
-       for (int j = 0; j < current.children.size(); j++) {
-         MtasConfiguration reference = current.children.get(j);
-         if (!reference.name.equals(XML_REFERENCE)) {
-           continue;
-         }
-         String name = reference.attributes.get(XML_REFERENCE_NAME);
-         String ref = reference.attributes.get(XML_REFERENCE_REF);
-         if ((name != null) && (ref != null)) {
-           // a later reference with the same name replaces the earlier one
-           MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
-               MAPPING_TYPE_REF, name, false, ref);
-           refTypes.put(getQName(t.getName()), t);
-         }
-       }
-     } else if (current.name.equals(XML_MAPPINGS)) {
-       for (int j = 0; j < current.children.size(); j++) {
-         MtasConfiguration mapping = current.children.get(j);
-         if (!mapping.name.equals(XML_MAPPING)) {
-           continue;
-         }
-         String typeMapping = mapping.attributes.get(XML_MAPPING_TYPE);
-         String nameMapping = mapping.attributes.get(XML_MAPPING_NAME);
-         if ((typeMapping == null) || (nameMapping == null)) {
-           continue;
-         }
-         // each branch builds the mapping for its type, lets it read its own
-         // configuration and then registers it in the matching map
-         if (typeMapping.equals(MAPPING_TYPE_RELATION)) {
-           MtasXMLParserMappingRelation m = new MtasXMLParserMappingRelation();
-           m.processConfig(mapping);
-           registerMappingItem(relationTypes, typeMapping, nameMapping, m);
-         } else if (typeMapping.equals(MAPPING_TYPE_RELATION_ANNOTATION)) {
-           MtasXMLParserMappingRelationAnnotation m = new MtasXMLParserMappingRelationAnnotation();
-           m.processConfig(mapping);
-           registerMappingItem(relationAnnotationTypes, typeMapping,
-               nameMapping, m);
-         } else if (typeMapping.equals(MAPPING_TYPE_WORD)) {
-           MtasXMLParserMappingWord m = new MtasXMLParserMappingWord();
-           m.processConfig(mapping);
-           registerMappingItem(wordTypes, typeMapping, nameMapping, m);
-         } else if (typeMapping.equals(MAPPING_TYPE_WORD_ANNOTATION)) {
-           MtasXMLParserMappingWordAnnotation m = new MtasXMLParserMappingWordAnnotation();
-           m.processConfig(mapping);
-           registerMappingItem(wordAnnotationTypes, typeMapping, nameMapping,
-               m);
-         } else if (typeMapping.equals(MAPPING_TYPE_GROUP)) {
-           MtasXMLParserMappingGroup m = new MtasXMLParserMappingGroup();
-           m.processConfig(mapping);
-           registerMappingItem(groupTypes, typeMapping, nameMapping, m);
-         } else if (typeMapping.equals(MAPPING_TYPE_GROUP_ANNOTATION)) {
-           MtasXMLParserMappingGroupAnnotation m = new MtasXMLParserMappingGroupAnnotation();
-           m.processConfig(mapping);
-           registerMappingItem(groupAnnotationTypes, typeMapping, nameMapping,
-               m);
-         } else {
-           throw new MtasConfigException(
-               "unknown mapping type " + typeMapping);
-         }
-       }
-     }
-   }
- }
- /**
-  * Adds a configured mapping to the type registered for an element name,
-  * creating and registering that type first when the name is not yet known.
-  *
-  * @param types the map for the mapping's category, keyed by element QName
-  * @param typeMapping the mapping type as declared in the configuration
-  * @param nameMapping the element name the mapping is declared for
-  * @param item the mapping, already initialised from its configuration
-  */
- private void registerMappingItem(
-     Map<QName, MtasParserType<MtasParserMapping<?>>> types,
-     String typeMapping, String nameMapping, MtasParserMapping<?> item) {
-   QName qn = getQName(nameMapping);
-   if (types.containsKey(qn)) {
-     types.get(qn).addItem(item);
-   } else {
-     MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
-         typeMapping, nameMapping, false);
-     t.addItem(item);
-     types.put(qn, t);
-   }
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
- *
- * Streams the XML from the reader with a StAX XMLStreamReader and fills a new
- * MtasTokenCollection, which is stored in the tokenCollection field and
- * returned.
- *
- * START_ELEMENT: the first element is compared with rootTag (when set). After
- * that, an element found in variableTypes updates the variables. While
- * content is being parsed the element is tried, in this order, as relation,
- * relation annotation, group, group annotation, word, word annotation and
- * reference; each attempt is guarded by which kinds of objects are currently
- * open in currentList. An object that passes prevalidateObject is appended to
- * the list of its kind and unknownAncestors is reset to 0; in every other
- * case unknownAncestors is incremented. Outside content, the contentTag
- * element switches content parsing on, and any other element raises an
- * exception unless allowNonContent is set.
- *
- * END_ELEMENT: a positive unknownAncestors is decremented first. Otherwise
- * the first non-empty list in the order reference, word annotation, word,
- * group annotation, relation annotation, relation, group decides what is
- * closed: the last object of that list is removed, its positions and offset
- * are stored under its id in idPositions and idOffsets, unknownAncestors is
- * restored from the object and computeMappingsFromObject is called. A closing
- * word also adds its positions and offsets to every open group.
- *
- * After the stream: tokens registered in updateList are updated with variable
- * values, offsets and positions. Referenced ids are resolved directly through
- * idPositions and, for relations, through relationKeyMap using
- * recursiveCollect with a depth limit of 10. Finally
- * tokenCollection.check(autorepair, makeunique) is called.
- *
- * Parameter reader: the source of the XML document.
- * Returns: the filled token collection.
- * Throws MtasParserException: if the declared encoding is not UTF-8, the
- * first element is not rootTag, an element outside content is not allowed,
- * or the XML stream cannot be read.
- * Throws MtasConfigException: declared; not thrown directly by the statements
- * in this method.
- *
- * NOTE(review): content parsing is switched off on an end element with the
- * hard-coded local name "text", while it is switched on by contentTag -
- * confirm that this asymmetry is intended.
- * NOTE(review): the XMLInputFactory is used with its default settings; DTD
- * and external entity handling are not restricted here - confirm that the
- * reader content is trusted.
- * NOTE(review): the XMLStreamException is only logged at debug level and its
- * message copied; the cause is not attached to the MtasParserException.
- * NOTE(review): relationKeyMap is an instance field and is not cleared in
- * this method - presumably entries survive between calls; verify.
- */
- @Override
- public MtasTokenCollection createTokenCollection(Reader reader)
- throws MtasParserException, MtasConfigException {
- Boolean hasRoot = rootTag == null ? true : false; // no root check when rootTag is unset
- Boolean parsingContent = contentTag == null ? true : false; // without contentTag everything is content
- String textContent = null;
- Integer unknownAncestors = 0;
- Integer lastOffset = 0;
- AtomicInteger position = new AtomicInteger(0);
- Map<String, Set<Integer>> idPositions = new HashMap<>();
- Map<String, Integer[]> idOffsets = new HashMap<>();
- Map<String, Map<Integer, Set<String>>> updateList = createUpdateList();
- Map<String, List<MtasParserObject>> currentList = createCurrentList();
- Map<String, Map<String, String>> variables = createVariables();
- tokenCollection = new MtasTokenCollection();
- MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
- XMLInputFactory factory = XMLInputFactory.newInstance();
- try {
- XMLStreamReader streamReader = factory.createXMLStreamReader(reader);
- QName qname;
- try {
- int event = streamReader.getEventType();
- MtasParserType<?> currentType;
- MtasParserType<?> tmpCurrentType;
- MtasParserType<?> tmpVariableType;
- MtasParserObject currentObject = null;
- MtasParserObject variableObject = null;
- while (true) { // left through the break after the switch once hasNext() is false
- switch (event) {
- case XMLStreamConstants.START_DOCUMENT:
- log.debug("start of document");
- String encodingScheme = streamReader.getCharacterEncodingScheme();
- if (encodingScheme == null) {
- // no encoding declared: accepted for now and treated as utf-8
- log.info("No encodingScheme found, assume utf-8");
- //throw new MtasParserException("No encodingScheme found");
- } else if (!encodingScheme.equalsIgnoreCase("utf-8")) {
- throw new MtasParserException(
- "XML not UTF-8 encoded but '" + encodingScheme + "'");
- }
- break;
- case XMLStreamConstants.END_DOCUMENT:
- log.debug("end of document");
- break;
- case XMLStreamConstants.SPACE:
- // set offset to the reader location reported for this event
- lastOffset = streamReader.getLocation().getCharacterOffset();
- break;
- case XMLStreamConstants.START_ELEMENT:
- // get data
- qname = streamReader.getName();
- // check for rootTag
- if (!hasRoot) {
- if (qname.equals(getQName(rootTag))) {
- hasRoot = true;
- } else {
- throw new MtasParserException("No " + rootTag);
- }
- // parse content
- } else {
- if ((tmpVariableType = variableTypes.get(qname)) != null) {
- variableObject = new MtasParserObject(tmpVariableType);
- collectAttributes(variableObject, streamReader);
- computeVariablesFromObject(variableObject, currentList,
- variables);
- }
- if (parsingContent) {
- // check for relation : not within word, not within
- // groupAnnotation
- if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
- && (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .isEmpty())
- && (tmpCurrentType = relationTypes.get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentList.get(MAPPING_TYPE_RELATION).add(currentObject);
- unknownAncestors = 0;
- }
- // check for relation annotation: not within word, but within
- // relation
- } else if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
- && (!currentList.get(MAPPING_TYPE_RELATION).isEmpty())
- && (tmpCurrentType = relationAnnotationTypes
- .get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentList.get(MAPPING_TYPE_RELATION_ANNOTATION)
- .add(currentObject);
- unknownAncestors = 0;
- }
- // check for group: not within word, not within relation, not
- // within groupAnnotation
- } else if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
- && (currentList.get(MAPPING_TYPE_RELATION).isEmpty())
- && (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .isEmpty())
- && (tmpCurrentType = groupTypes.get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
- unknownAncestors = 0;
- }
- // check for group annotation: not within word, not within
- // relation, but within group
- } else if ((currentList.get(MAPPING_TYPE_WORD).isEmpty())
- && (currentList.get(MAPPING_TYPE_RELATION).isEmpty())
- && (!currentList.get(MAPPING_TYPE_GROUP).isEmpty())
- && (tmpCurrentType = groupAnnotationTypes
- .get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .add(currentObject);
- unknownAncestors = 0;
- }
- // check for word: not within relation, not within
- // groupAnnotation, not within word, not within wordAnnotation
- } else if ((currentList.get(MAPPING_TYPE_RELATION).isEmpty())
- && (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .isEmpty())
- && (currentList.get(MAPPING_TYPE_WORD).isEmpty())
- && (currentList.get(MAPPING_TYPE_WORD_ANNOTATION).isEmpty())
- && (tmpCurrentType = wordTypes.get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setOffsetStart(lastOffset);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentObject.addPosition(position.getAndIncrement()); // each accepted word takes the next position
- currentList.get(MAPPING_TYPE_WORD).add(currentObject);
- unknownAncestors = 0;
- }
- // check for word annotation: not within relation, not within
- // groupAnnotation, but within word
- } else if ((currentList.get(MAPPING_TYPE_RELATION).isEmpty())
- && (currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .isEmpty())
- && (!currentList.get(MAPPING_TYPE_WORD).isEmpty())
- && (tmpCurrentType = wordAnnotationTypes
- .get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.addPositions(currentList.get(MAPPING_TYPE_WORD)
- .get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
- .getPositions());
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
- .add(currentObject);
- unknownAncestors = 0;
- }
- // check for references: within relation
- } else if (!currentList.get(MAPPING_TYPE_RELATION).isEmpty()
- && (tmpCurrentType = refTypes.get(qname)) != null) {
- currentObject = new MtasParserObject(tmpCurrentType);
- collectAttributes(currentObject, streamReader);
- currentObject.setUnknownAncestorNumber(unknownAncestors);
- currentObject.setRealOffsetStart(lastOffset);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors++;
- } else {
- currentType = tmpCurrentType;
- currentList.get(MAPPING_TYPE_REF).add(currentObject);
- unknownAncestors = 0;
- // add reference to ancestor relations
- for (MtasParserObject currentRelation : currentList
- .get(MAPPING_TYPE_RELATION)) {
- currentRelation.addRefId(currentObject
- .getAttribute(currentType.getRefAttributeName()));
- // register mapping for relation (for recursive relations)
- SortedSet<String> keyMapList;
- if (currentRelation.getId() != null) {
- if (relationKeyMap
- .containsKey(currentRelation.getId())) {
- keyMapList = relationKeyMap
- .get(currentRelation.getId());
- } else {
- keyMapList = new TreeSet<>();
- relationKeyMap.put(currentRelation.getId(),
- keyMapList);
- }
- keyMapList.add(currentObject
- .getAttribute(currentType.getRefAttributeName()));
- }
- }
- }
- } else {
- unknownAncestors++;
- }
- // check for start content
- } else if (qname.equals(getQName(contentTag))) {
- parsingContent = true;
- // unexpected
- } else if (!allowNonContent) {
- throw new MtasParserException(
- "Unexpected " + qname.getLocalPart() + " in document");
- }
- }
- // set offset (end of start-element)
- lastOffset = streamReader.getLocation().getCharacterOffset();
- break;
- case XMLStreamConstants.END_ELEMENT:
- // set offset (end of end-element)
- lastOffset = streamReader.getLocation().getCharacterOffset();
- // get data
- qname = streamReader.getName();
- // parse content
- if (parsingContent) {
- if (unknownAncestors > 0) {
- unknownAncestors--;
- // check for reference: because otherwise currentList should
- // contain no references
- } else if (!currentList.get(MAPPING_TYPE_REF).isEmpty()) {
- if ((currentType = refTypes.get(qname)) != null) {
- currentObject = currentList.get(MAPPING_TYPE_REF)
- .remove(currentList.get(MAPPING_TYPE_REF).size() - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- // ignore text and realOffset: not relevant
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- // this shouldn't happen
- }
- // check for wordAnnotation: because otherwise currentList
- // should contain no wordAnnotations
- } else if (!currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
- .isEmpty()) {
- if ((currentType = wordAnnotationTypes.get(qname)) != null) {
- currentObject = currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
- .remove(
- currentList.get(MAPPING_TYPE_WORD_ANNOTATION).size()
- - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- currentObject.setRealOffsetEnd(lastOffset);
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- // offset always null, so update later with word (should be
- // possible)
- if ((currentObject.getId() != null)
- && (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
- currentList.get(MAPPING_TYPE_WORD)
- .get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
- .addUpdateableIdWithOffset(currentObject.getId());
- }
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- // this shouldn't happen
- }
- // check for word: because otherwise currentList should contain
- // no words
- } else if (!currentList.get(MAPPING_TYPE_WORD).isEmpty()) {
- if ((currentType = wordTypes.get(qname)) != null) {
- currentObject = currentList.get(MAPPING_TYPE_WORD)
- .remove(currentList.get(MAPPING_TYPE_WORD).size() - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- currentObject.setOffsetEnd(lastOffset);
- currentObject.setRealOffsetEnd(lastOffset);
- // update ancestor groups with position and offset
- for (MtasParserObject currentGroup : currentList
- .get(MAPPING_TYPE_GROUP)) {
- currentGroup.addPositions(currentObject.getPositions());
- currentGroup.addOffsetStart(currentObject.getOffsetStart());
- currentGroup.addOffsetEnd(currentObject.getOffsetEnd());
- }
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- // this shouldn't happen
- }
- // check for group annotation: because otherwise currentList
- // should contain no groupAnnotations
- } else if (!currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .isEmpty()) {
- if ((currentType = groupAnnotationTypes.get(qname)) != null) {
- currentObject = currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .remove(
- currentList.get(MAPPING_TYPE_GROUP_ANNOTATION).size()
- - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- currentObject.setRealOffsetEnd(lastOffset);
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- // this shouldn't happen
- }
- // check for relation annotation
- } else if (!currentList.get(MAPPING_TYPE_RELATION_ANNOTATION)
- .isEmpty()) {
- if ((currentType = relationAnnotationTypes
- .get(qname)) != null) {
- currentObject = currentList
- .get(MAPPING_TYPE_RELATION_ANNOTATION).remove(currentList
- .get(MAPPING_TYPE_RELATION_ANNOTATION).size() - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- currentObject.setRealOffsetEnd(lastOffset);
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- // this shouldn't happen
- }
- // check for relation
- } else if (!currentList.get(MAPPING_TYPE_RELATION).isEmpty()) {
- if ((currentType = relationTypes.get(qname)) != null) {
- currentObject = currentList.get(MAPPING_TYPE_RELATION).remove(
- currentList.get(MAPPING_TYPE_RELATION).size() - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- // ignore text: should not occur
- currentObject.setRealOffsetEnd(lastOffset);
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- // this shouldn't happen
- }
- // check for group
- } else if (!currentList.get(MAPPING_TYPE_GROUP).isEmpty()) {
- if ((currentType = groupTypes.get(qname)) != null) {
- currentObject = currentList.get(MAPPING_TYPE_GROUP)
- .remove(currentList.get(MAPPING_TYPE_GROUP).size() - 1);
- assert currentObject.getType()
- .equals(currentType) : "object expected to be "
- + currentObject.getType().getName() + ", not "
- + currentType.getName();
- assert unknownAncestors == 0 : "error in administration "
- + currentObject.getType().getName();
- // ignore text: should not occur
- currentObject.setRealOffsetEnd(lastOffset);
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(),
- currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- unknownAncestors--; // NOTE(review): only reached when unknownAncestors is not positive, so it goes below zero - confirm
- }
- } else if (qname.equals(getQName("text"))) {
- parsingContent = false;
- assert unknownAncestors == 0 : "error in administration unknownAncestors";
- assert currentList.get(MAPPING_TYPE_REF)
- .isEmpty() : "error in administration references";
- assert currentList.get(MAPPING_TYPE_GROUP)
- .isEmpty() : "error in administration groups";
- assert currentList.get(MAPPING_TYPE_GROUP_ANNOTATION)
- .isEmpty() : "error in administration groupAnnotations";
- assert currentList.get(MAPPING_TYPE_WORD)
- .isEmpty() : "error in administration words";
- assert currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
- .isEmpty() : "error in administration wordAnnotations";
- assert currentList.get(MAPPING_TYPE_RELATION)
- .isEmpty() : "error in administration relations";
- assert currentList.get(MAPPING_TYPE_RELATION_ANNOTATION)
- .isEmpty() : "error in administration relationAnnotations";
- }
- }
- // forget text
- textContent = null;
- break;
- case XMLStreamConstants.CHARACTERS:
- // set offset to the reader location reported for this event
- lastOffset = streamReader.getLocation().getCharacterOffset();
- // check for text
- if (streamReader.hasText()) {
- textContent = streamReader.getText();
- }
- if (currentObject != null && unknownAncestors.equals(0)) {
- currentObject.addText(textContent);
- }
- break;
- default:
- break;
- }
- if (!streamReader.hasNext()) {
- break;
- }
- event = streamReader.next();
- }
- } finally {
- streamReader.close();
- }
- // final checks
- assert unknownAncestors == 0 : "error in administration unknownAncestors";
- assert hasRoot : "no " + rootTag;
- } catch (XMLStreamException e) {
- log.debug(e);
- throw new MtasParserException("No valid XML: " + e.getMessage());
- }
- // update tokens with variable
- for (Entry<Integer, Set<String>> updateItem : updateList
- .get(UPDATE_TYPE_VARIABLE).entrySet()) {
- MtasToken token = tokenCollection.get(updateItem.getKey());
- String encodedPrefix = token.getPrefix();
- String encodedPostfix = token.getPostfix();
- token.setValue(decodeAndUpdateWithVariables(encodedPrefix, encodedPostfix,
- variables));
- }
- // update tokens with offset
- for (Entry<Integer, Set<String>> updateItem : updateList
- .get(UPDATE_TYPE_OFFSET).entrySet()) {
- Set<String> refIdList = new HashSet<>();
- for (String refId : updateItem.getValue()) {
- if (idPositions.containsKey(refId)) {
- refIdList.add(refId);
- }
- if (relationKeyMap.containsKey(refId)) {
- refIdList.addAll(recursiveCollect(refId, relationKeyMap, 10));
- }
- }
- for (String refId : refIdList) {
- Integer[] refOffset = idOffsets.get(refId);
- Integer tokenId = updateItem.getKey();
- if (tokenId != null && refOffset != null) {
- MtasToken token = tokenCollection.get(tokenId);
- token.addOffset(refOffset[0], refOffset[1]);
- }
- }
- }
- // update tokens with position
- for (Entry<Integer, Set<String>> updateItem : updateList
- .get(UPDATE_TYPE_POSITION).entrySet()) {
- HashSet<String> refIdList = new HashSet<>();
- for (String refId : updateItem.getValue()) {
- if (idPositions.containsKey(refId)) {
- refIdList.add(refId);
- }
- if (relationKeyMap.containsKey(refId)) {
- refIdList.addAll(recursiveCollect(refId, relationKeyMap, 10));
- }
- }
- for (String refId : refIdList) {
- Set<Integer> refPositions = idPositions.get(refId);
- Integer tokenId = updateItem.getKey();
- if (tokenId != null && refPositions != null) {
- MtasToken token = tokenCollection.get(tokenId);
- token.addPositions(refPositions);
- }
- }
- }
- // final check
- tokenCollection.check(autorepair, makeunique);
- return tokenCollection;
- }
- /**
- * Recursive collect.
- *
- * @param refId the ref id
- * @param relationKeyMap the relation key map
- * @param maxRecursion the max recursion
- * @return the collection<? extends string>
- */
- private Collection<? extends String> recursiveCollect(String refId,
- Map<String, SortedSet<String>> relationKeyMap, int maxRecursion) {
- Set<String> list = new HashSet<>();
- if (maxRecursion > 0 && relationKeyMap.containsKey(refId)) {
- SortedSet<String> subList = relationKeyMap.get(refId);
- for (String subRefId : subList) {
- list.add(subRefId);
- list.addAll(
- recursiveCollect(subRefId, relationKeyMap, maxRecursion - 1));
- }
- }
- return list;
- }
- /**
- * Gets the q name.
- *
- * @param key the key
- * @return the q name
- */
- private QName getQName(String key) {
- QName qname;
- if ((qname = qNames.get(key)) == null) {
- qname = new QName(namespaceURI, key);
- qNames.put(key, qname);
- }
- return qname;
- }
- /**
- * Collect attributes.
- *
- * @param currentObject the current object
- * @param streamReader the stream reader
- */
- public void collectAttributes(MtasParserObject currentObject,
- XMLStreamReader streamReader) {
- String attributeNamespaceURI;
- currentObject.objectAttributes.clear();
- currentObject.objectId = streamReader.getAttributeValue(namespaceURI_id,
- "id");
- for (int i = 0; i < streamReader.getAttributeCount(); i++) {
- attributeNamespaceURI = streamReader.getAttributeNamespace(i);
- if (attributeNamespaceURI == null || attributeNamespaceURI.equals("")) {
- attributeNamespaceURI = streamReader.getNamespaceURI();
- }
- if (namespaceURI == null || attributeNamespaceURI.equals(namespaceURI)) {
- currentObject.objectAttributes.put(
- streamReader.getAttributeLocalName(i),
- streamReader.getAttributeValue(i));
- } else {
- HashMap<String, String> otherMap;
- if(!currentObject.objectOtherAttributes.containsKey(attributeNamespaceURI)) {
- otherMap = new HashMap<>();
- currentObject.objectOtherAttributes.put(attributeNamespaceURI, otherMap);
- } else {
- otherMap = currentObject.objectOtherAttributes.get(attributeNamespaceURI);
- }
- otherMap.put(
- streamReader.getAttributeLocalName(i),
- streamReader.getAttributeValue(i));
- }
- }
- }
- /**
- * The Class MtasXMLParserMappingRelation.
- */
- private class MtasXMLParserMappingRelation
- extends MtasParserMapping<MtasXMLParserMappingRelation> {
- /**
- * Instantiates a new mtas XML parser mapping relation.
- */
- public MtasXMLParserMappingRelation() {
- super();
- this.position = SOURCE_REFS;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_REFS;
- this.type = MAPPING_TYPE_RELATION;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasXMLParserMappingRelation self() {
- return this;
- }
- }
- /**
- * The Class MtasXMLParserMappingRelationAnnotation.
- */
- private class MtasXMLParserMappingRelationAnnotation
- extends MtasParserMapping<MtasXMLParserMappingRelationAnnotation> {
- /**
- * Instantiates a new mtas XML parser mapping relation annotation.
- */
- public MtasXMLParserMappingRelationAnnotation() {
- super();
- this.position = SOURCE_ANCESTOR_RELATION;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_ANCESTOR_RELATION;
- this.type = MAPPING_TYPE_RELATION_ANNOTATION;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasXMLParserMappingRelationAnnotation self() {
- return this;
- }
- }
- /**
- * The Class MtasXMLParserMappingGroup.
- */
- private class MtasXMLParserMappingGroup
- extends MtasParserMapping<MtasXMLParserMappingGroup> {
- /**
- * Instantiates a new mtas XML parser mapping group.
- */
- public MtasXMLParserMappingGroup() {
- super();
- this.position = SOURCE_OWN;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_OWN;
- this.type = MAPPING_TYPE_GROUP;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasXMLParserMappingGroup self() {
- return this;
- }
- }
- /**
- * The Class MtasXMLParserMappingGroupAnnotation.
- */
- private class MtasXMLParserMappingGroupAnnotation
- extends MtasParserMapping<MtasXMLParserMappingGroupAnnotation> {
- /**
- * Instantiates a new mtas XML parser mapping group annotation.
- */
- public MtasXMLParserMappingGroupAnnotation() {
- super();
- this.position = SOURCE_ANCESTOR_GROUP;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_ANCESTOR_GROUP;
- this.type = MAPPING_TYPE_GROUP_ANNOTATION;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasXMLParserMappingGroupAnnotation self() {
- return this;
- }
- /*
- * (non-Javadoc)
- *
- * @see
- * mtas.analysis.parser.MtasBasicParser.MtasParserMapping#setStartEnd(java.
- * lang.String, java.lang.String)
- */
- @Override
- protected void setStartEnd(String start, String end) {
- super.setStartEnd(start, end);
- if (start != null && end != null) {
- position = SOURCE_REFS;
- offset = SOURCE_REFS;
- }
- }
- }
- /**
- * The Class MtasXMLParserMappingWord.
- */
- private class MtasXMLParserMappingWord
- extends MtasParserMapping<MtasXMLParserMappingWord> {
- /**
- * Instantiates a new mtas XML parser mapping word.
- */
- public MtasXMLParserMappingWord() {
- super();
- this.position = SOURCE_OWN;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_OWN;
- this.type = MAPPING_TYPE_WORD;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasXMLParserMappingWord self() {
- return this;
- }
- }
- /**
- * The Class MtasXMLParserMappingWordAnnotation.
- */
- private class MtasXMLParserMappingWordAnnotation
- extends MtasParserMapping<MtasXMLParserMappingWordAnnotation> {
- /**
- * Instantiates a new mtas XML parser mapping word annotation.
- */
- public MtasXMLParserMappingWordAnnotation() {
- super();
- this.position = SOURCE_OWN;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_ANCESTOR_WORD;
- this.type = MAPPING_TYPE_WORD_ANNOTATION;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasXMLParserMappingWordAnnotation self() {
- return this;
- }
- }
- }