MtasCRMParser.java
package mtas.analysis.parser;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import mtas.analysis.token.MtasTokenCollection;
import mtas.analysis.token.MtasTokenIdFactory;
import mtas.analysis.util.MtasBufferedReader;
import mtas.analysis.util.MtasConfigException;
import mtas.analysis.util.MtasConfiguration;
import mtas.analysis.util.MtasParserException;
/**
* The Class MtasCRMParser.
*/
public class MtasCRMParser extends MtasBasicParser {
/** The Constant log. */
private static final Log log = LogFactory.getLog(MtasCRMParser.class);
/** The word type. */
private MtasParserType<MtasParserMapping<?>> wordType = null;
/** The word annotation types. */
private HashMap<String, MtasParserType<MtasParserMapping<?>>> wordAnnotationTypes = new HashMap<>();
/** The crm sentence types. */
private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmSentenceTypes = new HashMap<>();
/** The crm clause types. */
private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmClauseTypes = new HashMap<>();
/** The crm pair types. */
private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmPairTypes = new HashMap<>();
/** The functions. */
private HashMap<String, HashMap<String, MtasCRMParserFunction>> functions = new HashMap<>();
private HashMap<Integer, HashMap<String, String>> filterReplace = new HashMap<>();
/** The Constant MAPPING_TYPE_CRM_SENTENCE. */
protected static final String MAPPING_TYPE_CRM_SENTENCE = "crmSentence";
/** The Constant MAPPING_TYPE_CRM_CLAUSE. */
protected static final String MAPPING_TYPE_CRM_CLAUSE = "crmClause";
/** The Constant MAPPING_TYPE_CRM_PAIR. */
protected static final String MAPPING_TYPE_CRM_PAIR = "crmPair";
protected static final String FILTER_TYPE_REPLACE = "replace";
/** The history pair. */
private HashMap<String, HashMap<String, MtasParserObject>> historyPair = new HashMap<>();
/** The pair pattern. */
Pattern pairPattern = Pattern.compile("^([b|e])([a-z])([0-9]+)$");
/**
* Instantiates a new mtas CRM parser.
*
* @param config the config
*/
public MtasCRMParser(MtasConfiguration config) {
super(config);
try {
initParser();
// System.out.print(printConfig());
} catch (MtasConfigException e) {
log.error(e);
}
}
/*
* (non-Javadoc)
*
* @see mtas.analysis.parser.MtasParser#initParser()
*/
@SuppressWarnings("unchecked")
@Override
protected void initParser() throws MtasConfigException {
super.initParser();
if (config != null) {
// always word, no mappings
wordType = new MtasParserType<>(MAPPING_TYPE_WORD, null, false);
for (int i = 0; i < config.children.size(); i++) {
MtasConfiguration current = config.children.get(i);
if (current.name.equals("filters")) {
for (int j = 0; j < current.children.size(); j++) {
if (current.children.get(j).name.equals("filter")) {
MtasConfiguration filter = current.children.get(j);
String typeFilter = filter.attributes.get("type");
String nameFilter = filter.attributes.get("name");
if(typeFilter!=null) {
if(typeFilter.equals(FILTER_TYPE_REPLACE)) {
String value = filter.attributes.get("value");
String replace = filter.attributes.get("replace");
if(nameFilter!=null && value!=null && replace!=null) {
String[] names = nameFilter.split(Pattern.quote(","));
for(String name : names) {
try {
int nameInt = Integer.parseInt(name);
HashMap<String, String> nameMap;
if(!filterReplace.containsKey(nameInt)) {
nameMap = new HashMap<>();
filterReplace.put(nameInt, nameMap);
} else {
nameMap = filterReplace.get(nameInt);
}
nameMap.put(value, replace);
} catch (NumberFormatException e) {
log.info(e);
}
}
} else {
throw new MtasConfigException("no name, value or replace for filter "
+ typeFilter );
}
} else {
throw new MtasConfigException("unknown filter type "
+ typeFilter );
}
} else {
throw new MtasConfigException("no type provided for filter" );
}
}
}
} else if (current.name.equals("mappings")) {
for (int j = 0; j < current.children.size(); j++) {
if (current.children.get(j).name.equals("mapping")) {
MtasConfiguration mapping = current.children.get(j);
String typeMapping = mapping.attributes.get("type");
String nameMapping = mapping.attributes.get("name");
if ((typeMapping != null)) {
if (typeMapping.equals(MAPPING_TYPE_WORD)) {
MtasCRMParserMappingWordAnnotation m = new MtasCRMParserMappingWordAnnotation();
m.processConfig(mapping);
wordType.addItem(m);
} else if (typeMapping.equals(MAPPING_TYPE_WORD_ANNOTATION)
&& (nameMapping != null)) {
MtasCRMParserMappingWordAnnotation m = new MtasCRMParserMappingWordAnnotation();
m.processConfig(mapping);
if (wordAnnotationTypes.containsKey(nameMapping)) {
wordAnnotationTypes.get(nameMapping).addItem(m);
} else {
MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
typeMapping, nameMapping, false);
t.addItem(m);
wordAnnotationTypes.put(nameMapping, t);
}
} else if (typeMapping.equals(MAPPING_TYPE_CRM_SENTENCE)) {
MtasCRMParserMappingCRMSentence m = new MtasCRMParserMappingCRMSentence();
m.processConfig(mapping);
if (crmSentenceTypes.containsKey(nameMapping)) {
crmSentenceTypes.get(nameMapping).addItem(m);
} else {
MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
MAPPING_TYPE_GROUP, nameMapping, true);
t.addItem(m);
crmSentenceTypes.put(nameMapping, t);
}
} else if (typeMapping.equals(MAPPING_TYPE_CRM_CLAUSE)) {
MtasCRMParserMappingCRMSentence m = new MtasCRMParserMappingCRMSentence();
m.processConfig(mapping);
if (crmClauseTypes.containsKey(nameMapping)) {
crmClauseTypes.get(nameMapping).addItem(m);
} else {
MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
MAPPING_TYPE_GROUP, nameMapping, true);
t.addItem(m);
crmClauseTypes.put(nameMapping, t);
}
} else if (typeMapping.equals(MAPPING_TYPE_CRM_PAIR)) {
MtasCRMParserMappingCRMPair m = new MtasCRMParserMappingCRMPair();
m.processConfig(mapping);
if (crmPairTypes.containsKey(nameMapping)) {
crmPairTypes.get(nameMapping).addItem(m);
} else {
MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
MAPPING_TYPE_RELATION, nameMapping, true);
t.addItem(m);
crmPairTypes.put(nameMapping, t);
}
} else {
throw new MtasConfigException("unknown mapping type "
+ typeMapping + " or missing name");
}
}
}
}
} else if (current.name.equals("functions")) {
for (int j = 0; j < current.children.size(); j++) {
if (current.children.get(j).name.equals("function")) {
MtasConfiguration function = current.children.get(j);
String nameFunction = function.attributes.get("name");
String typeFunction = function.attributes.get("type");
String splitFunction = function.attributes.get("split");
if (nameFunction != null && typeFunction != null) {
MtasCRMParserFunction mtasCRMParserFunction = new MtasCRMParserFunction(
typeFunction, splitFunction);
if (!functions.containsKey(typeFunction)) {
functions.put(typeFunction,
new HashMap<String, MtasCRMParserFunction>());
}
functions.get(typeFunction).put(nameFunction,
mtasCRMParserFunction);
MtasConfiguration subCurrent = current.children.get(j);
for (int k = 0; k < subCurrent.children.size(); k++) {
if (subCurrent.children.get(k).name.equals("condition")) {
MtasConfiguration subSubCurrent = subCurrent.children
.get(k);
if (subSubCurrent.attributes.containsKey("value")) {
String[] valuesCondition = subSubCurrent.attributes
.get("value").split(Pattern.quote(","));
ArrayList<MtasCRMParserFunctionOutput> valueOutputList = new ArrayList<>();
for (int l = 0; l < subSubCurrent.children.size(); l++) {
if (subSubCurrent.children.get(l).name
.equals("output")) {
String valueOutput = subSubCurrent.children
.get(l).attributes.get("value");
String nameOutput = subSubCurrent.children
.get(l).attributes.get("name");
if (nameOutput != null) {
MtasCRMParserFunctionOutput o = new MtasCRMParserFunctionOutput(
nameOutput, valueOutput);
valueOutputList.add(o);
}
}
}
if (!valueOutputList.isEmpty()) {
for (String valueCondition : valuesCondition) {
if (mtasCRMParserFunction.output
.containsKey(valueCondition)) {
mtasCRMParserFunction.output.get(valueCondition)
.addAll(
(Collection<? extends MtasCRMParserFunctionOutput>) valueOutputList
.clone());
} else {
mtasCRMParserFunction.output.put(valueCondition,
(ArrayList<MtasCRMParserFunctionOutput>) valueOutputList
.clone());
}
}
}
}
}
}
}
}
}
}
}
}
}
/*
* (non-Javadoc)
*
* @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
*/
@Override
public MtasTokenCollection createTokenCollection(Reader reader)
throws MtasParserException, MtasConfigException {
AtomicInteger position = new AtomicInteger(0);
MtasCRMAncestors unknownAncestors = new MtasCRMAncestors();
Map<String, Set<Integer>> idPositions = new HashMap<>();
Map<String, Integer[]> idOffsets = new HashMap<>();
Map<String, Map<Integer, Set<String>>> updateList = createUpdateList();
Map<String, List<MtasParserObject>> currentList = createCurrentList();
tokenCollection = new MtasTokenCollection();
MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
try (MtasBufferedReader br = new MtasBufferedReader(reader)) {
String line;
int currentOffset;
int previousOffset = br.getPosition();
MtasParserObject currentObject;
Pattern headerPattern = Pattern.compile("^@ @ @(.*)$");
Pattern regularPattern = Pattern.compile(
"^([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+)$");
Matcher matcherHeader;
Matcher matcherRegular = null;
Set<MtasParserObject> newPreviousSentence = new HashSet<>();
Set<MtasParserObject> previousSentence = new HashSet<>();
Set<MtasParserObject> newPreviousClause = new HashSet<>();
Set<MtasParserObject> previousClause = new HashSet<>();
String[] matcherList = new String[8];
while ((line = br.readLine()) != null) {
currentOffset = br.getPosition();
matcherHeader = headerPattern.matcher(line.trim());
matcherRegular = regularPattern.matcher(line.trim());
if (matcherRegular.matches()) {
newPreviousSentence.clear();
matcherList = createMatcherList(matcherRegular);
for (int i = 4; i < 8; i++) {
List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
Set<MtasParserObject> tmpList = processCRMSentence(
mtasTokenIdFactory, String.valueOf(i),
matcherList[i], currentOffset,
functionOutputList, unknownAncestors, currentList, updateList,
idPositions, idOffsets, previousSentence, previousClause);
if (tmpList != null) {
newPreviousSentence.addAll(tmpList);
}
for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
tmpList = processCRMSentence(mtasTokenIdFactory,
functionOutput.name, functionOutput.value, currentOffset,
functionOutputList, unknownAncestors, currentList, updateList,
idPositions, idOffsets, previousSentence, previousClause);
if (tmpList != null) {
newPreviousSentence.addAll(tmpList);
}
}
}
if (!newPreviousSentence.isEmpty()) {
previousSentence.clear();
previousSentence.addAll(newPreviousSentence);
}
newPreviousClause.clear();
for (int i = 4; i < 8; i++) {
ArrayList<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
Set<MtasParserObject> tmpList = processCRMClause(mtasTokenIdFactory,
String.valueOf(i), matcherList[i], currentOffset,
functionOutputList, unknownAncestors, currentList, updateList,
idPositions, idOffsets, previousClause);
if (tmpList != null) {
newPreviousClause.addAll(tmpList);
}
for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
tmpList = processCRMClause(mtasTokenIdFactory,
functionOutput.name, functionOutput.value, currentOffset,
functionOutputList, unknownAncestors, currentList, updateList,
idPositions, idOffsets, previousClause);
if (tmpList != null) {
newPreviousClause.addAll(tmpList);
}
}
}
if (!newPreviousClause.isEmpty()) {
previousClause.clear();
previousClause.addAll(newPreviousClause);
}
}
if (matcherRegular.matches() && !matcherHeader.matches()) {
matcherRegular = regularPattern.matcher(line.trim());
if (matcherRegular.matches()) {
// regular line - start word
currentObject = new MtasParserObject(wordType);
currentObject.setOffsetStart(previousOffset);
currentObject.setRealOffsetStart(previousOffset);
currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors.unknown++;
} else {
int p = position.getAndIncrement();
currentObject.addPosition(p);
currentObject.objectId = "word_" + p;
currentList.get(MAPPING_TYPE_WORD).add(currentObject);
unknownAncestors.unknown = 0;
// check for crmPair
for (int i = 0; i < 8; i++) {
List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
processCRMPair(mtasTokenIdFactory, p, String.valueOf(i),
matcherList[i], currentOffset,
functionOutputList, unknownAncestors, currentList,
updateList, idPositions, idOffsets);
for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
processCRMPair(mtasTokenIdFactory, p, functionOutput.name,
functionOutput.value, currentOffset, functionOutputList,
unknownAncestors, currentList, updateList, idPositions,
idOffsets);
}
}
// compute word annotations
for (int i = 0; i < 8; i++) {
ArrayList<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
functionOutputList
.addAll(processWordAnnotation(mtasTokenIdFactory,
String.valueOf(i), matcherList[i],
previousOffset, currentOffset, unknownAncestors,
currentList, updateList, idPositions, idOffsets));
for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
processWordAnnotation(mtasTokenIdFactory, functionOutput.name,
functionOutput.value, previousOffset, currentOffset,
unknownAncestors, currentList, updateList, idPositions,
idOffsets);
}
}
}
// finish word
if (unknownAncestors.unknown > 0) {
unknownAncestors.unknown--;
} else {
currentObject = currentList.get(MAPPING_TYPE_WORD)
.remove(currentList.get(MAPPING_TYPE_WORD).size() - 1);
assert unknownAncestors.unknown == 0 : "error in administration "
+ currentObject.getType().getName();
currentObject.setText(null);
currentObject.setOffsetEnd(currentOffset - 1);
currentObject.setRealOffsetEnd(currentOffset - 1);
// update ancestor groups with position and offset
for (MtasParserObject currentGroup : currentList
.get(MAPPING_TYPE_GROUP)) {
currentGroup.addPositions(currentObject.getPositions());
currentGroup.addOffsetStart(currentObject.getOffsetStart());
currentGroup.addOffsetEnd(currentObject.getOffsetEnd());
}
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(), currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors.unknown = currentObject
.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
}
} else {
// System.out.println("PROBLEM: " + line);
}
}
previousOffset = br.getPosition();
}
closePrevious(mtasTokenIdFactory, previousSentence, previousOffset,
unknownAncestors, currentList, updateList, idPositions, idOffsets);
closePrevious(mtasTokenIdFactory, previousClause, previousOffset,
unknownAncestors, currentList, updateList, idPositions, idOffsets);
} catch (IOException e) {
log.debug(e);
throw new MtasParserException(e.getMessage());
}
// final check
tokenCollection.check(autorepair, makeunique);
return tokenCollection;
}
private String[] createMatcherList(Matcher matcher) {
String[] list = new String[8];
String value;
for(int i=0; i<8; i++) {
value = matcher.group((i+1));
if(filterReplace.containsKey(i)) {
for(Entry<String,String> entry : filterReplace.get(i).entrySet()) {
value = value.replaceAll(Pattern.quote(entry.getKey()), entry.getValue());
}
}
list[i] = value;
}
return list;
}
/**
* Process word annotation.
*
* @param mtasTokenIdFactory the mtas token id factory
* @param name the name
* @param text the text
* @param previousOffset the previous offset
* @param currentOffset the current offset
* @param unknownAncestors the unknown ancestors
* @param currentList the current list
* @param updateList the update list
* @param idPositions the id positions
* @param idOffsets the id offsets
* @return the list
* @throws MtasParserException the mtas parser exception
* @throws MtasConfigException the mtas config exception
*/
private List<MtasCRMParserFunctionOutput> processWordAnnotation(
MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
Integer previousOffset, Integer currentOffset,
MtasCRMAncestors unknownAncestors,
Map<String, List<MtasParserObject>> currentList,
Map<String, Map<Integer, Set<String>>> updateList,
Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
throws MtasParserException, MtasConfigException {
MtasParserType tmpCurrentType;
MtasParserObject currentObject;
List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
if ((tmpCurrentType = wordAnnotationTypes.get(name)) != null) {
// start word annotation
currentObject = new MtasParserObject(tmpCurrentType);
currentObject.setRealOffsetStart(previousOffset);
currentObject.addPositions(currentList.get(MAPPING_TYPE_WORD)
.get((currentList.get(MAPPING_TYPE_WORD).size() - 1)).getPositions());
currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors.unknown++;
} else {
currentList.get(MAPPING_TYPE_WORD_ANNOTATION).add(currentObject);
unknownAncestors.unknown = 0;
}
// finish word annotation
if (unknownAncestors.unknown > 0) {
unknownAncestors.unknown--;
} else {
currentObject = currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
.remove(currentList.get(MAPPING_TYPE_WORD_ANNOTATION).size() - 1);
assert unknownAncestors.unknown == 0 : "error in administration "
+ currentObject.getType().getName();
if (functions.containsKey(MAPPING_TYPE_WORD_ANNOTATION)
&& functions.get(MAPPING_TYPE_WORD_ANNOTATION).containsKey(name)
&& text != null) {
MtasCRMParserFunction function = functions
.get(MAPPING_TYPE_WORD_ANNOTATION).get(name);
String[] value;
if (function.split != null) {
value = text.split(Pattern.quote(function.split));
} else {
value = new String[] { text };
}
for (int c = 0; c < value.length; c++) {
if (function.output.containsKey(value[c])) {
functionOutputList.addAll(function.output.get(value[c]));
}
}
}
currentObject.setText(text);
currentObject.setRealOffsetEnd(currentOffset - 1);
idPositions.put(currentObject.getId(), currentObject.getPositions());
idOffsets.put(currentObject.getId(), currentObject.getOffset());
// offset always null, so update later with word (should be possible)
if ((currentObject.getId() != null)
&& (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
currentList.get(MAPPING_TYPE_WORD)
.get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
.addUpdateableIdWithOffset(currentObject.getId());
}
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors.unknown = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
}
}
return functionOutputList;
}
/**
* Process CRM sentence.
*
* @param mtasTokenIdFactory the mtas token id factory
* @param name the name
* @param text the text
* @param currentOffset the current offset
* @param functionOutputList the function output list
* @param unknownAncestors the unknown ancestors
* @param currentList the current list
* @param updateList the update list
* @param idPositions the id positions
* @param idOffsets the id offsets
* @param previous the previous
* @param previousClause the previous clause
* @return the sets the
* @throws MtasParserException the mtas parser exception
* @throws MtasConfigException the mtas config exception
*/
private Set<MtasParserObject> processCRMSentence(
MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
Integer currentOffset,
List<MtasCRMParserFunctionOutput> functionOutputList,
MtasCRMAncestors unknownAncestors,
Map<String, List<MtasParserObject>> currentList,
Map<String, Map<Integer, Set<String>>> updateList,
Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets,
Set<MtasParserObject> previous, Set<MtasParserObject> previousClause)
throws MtasParserException, MtasConfigException {
MtasParserType tmpCurrentType;
MtasParserObject currentObject;
if ((tmpCurrentType = crmSentenceTypes.get(name)) != null) {
String filteredText = text.replaceAll("[^0-9\\-]", "");
currentObject = new MtasParserObject(tmpCurrentType);
currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
currentObject.setRealOffsetStart(currentOffset);
currentObject.setText(filteredText);
if (!prevalidateObject(currentObject, currentList)) {
return new HashSet<>();
} else {
closePrevious(mtasTokenIdFactory, previousClause, currentOffset,
unknownAncestors, currentList, updateList, idPositions, idOffsets);
closePrevious(mtasTokenIdFactory, previous, currentOffset,
unknownAncestors, currentList, updateList, idPositions, idOffsets);
previous.clear();
currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
unknownAncestors.unknown = 0;
return new HashSet<>(Arrays.asList(currentObject));
}
}
return new HashSet<>();
}
/**
* Process CRM clause.
*
* @param mtasTokenIdFactory the mtas token id factory
* @param name the name
* @param text the text
* @param currentOffset the current offset
* @param functionOutputList the function output list
* @param unknownAncestors the unknown ancestors
* @param currentList the current list
* @param updateList the update list
* @param idPositions the id positions
* @param idOffsets the id offsets
* @param previous the previous
* @return the sets the
* @throws MtasParserException the mtas parser exception
* @throws MtasConfigException the mtas config exception
*/
private Set<MtasParserObject> processCRMClause(
MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
Integer currentOffset,
List<MtasCRMParserFunctionOutput> functionOutputList,
MtasCRMAncestors unknownAncestors,
Map<String, List<MtasParserObject>> currentList,
Map<String, Map<Integer, Set<String>>> updateList,
Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets,
Set<MtasParserObject> previous)
throws MtasParserException, MtasConfigException {
MtasParserType tmpCurrentType;
MtasParserObject currentObject;
if ((tmpCurrentType = crmClauseTypes.get(name)) != null) {
String filteredText = text.replaceAll("[^0-9\\-]", "");
currentObject = new MtasParserObject(tmpCurrentType);
currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
currentObject.setRealOffsetStart(currentOffset);
currentObject.setText(filteredText);
if (!prevalidateObject(currentObject, currentList)) {
return new HashSet<>();
} else {
closePrevious(mtasTokenIdFactory, previous, currentOffset,
unknownAncestors, currentList, updateList, idPositions, idOffsets);
previous.clear();
currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
unknownAncestors.unknown = 0;
return new HashSet<>(Arrays.asList(currentObject));
}
}
return new HashSet<>();
}
/**
* Close previous.
*
* @param mtasTokenIdFactory the mtas token id factory
* @param previous the previous
* @param currentOffset the current offset
* @param unknownAncestors the unknown ancestors
* @param currentList the current list
* @param updateList the update list
* @param idPositions the id positions
* @param idOffsets the id offsets
* @throws MtasParserException the mtas parser exception
* @throws MtasConfigException the mtas config exception
*/
private void closePrevious(MtasTokenIdFactory mtasTokenIdFactory,
Set<MtasParserObject> previous, Integer currentOffset,
MtasCRMAncestors unknownAncestors,
Map<String, List<MtasParserObject>> currentList,
Map<String, Map<Integer, Set<String>>> updateList,
Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
throws MtasParserException, MtasConfigException {
for (MtasParserObject previousObject : previous) {
previousObject.setRealOffsetEnd(currentOffset);
idPositions.put(previousObject.getId(), previousObject.getPositions());
idOffsets.put(previousObject.getId(), previousObject.getOffset());
previousObject.updateMappings(idPositions, idOffsets);
unknownAncestors.unknown = previousObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, previousObject, currentList,
updateList);
currentList.get(MAPPING_TYPE_GROUP).remove(previousObject);
}
}
/**
* Process CRM pair.
*
* @param mtasTokenIdFactory the mtas token id factory
* @param position the position
* @param name the name
* @param text the text
* @param currentOffset the current offset
* @param functionOutputList the function output list
* @param unknownAncestors the unknown ancestors
* @param currentList the current list
* @param updateList the update list
* @param idPositions the id positions
* @param idOffsets the id offsets
* @throws MtasParserException the mtas parser exception
* @throws MtasConfigException the mtas config exception
*/
private void processCRMPair(MtasTokenIdFactory mtasTokenIdFactory,
int position, String name, String text, Integer currentOffset,
List<MtasCRMParserFunctionOutput> functionOutputList,
MtasCRMAncestors unknownAncestors,
Map<String, List<MtasParserObject>> currentList,
Map<String, Map<Integer, Set<String>>> updateList,
Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
throws MtasParserException, MtasConfigException {
MtasParserType tmpCurrentType;
MtasParserObject currentObject;
if ((tmpCurrentType = crmPairTypes.get(name)) != null) {
// get history
HashMap<String, MtasParserObject> currentNamePairHistory;
if (!historyPair.containsKey(name)) {
currentNamePairHistory = new HashMap<>();
historyPair.put(name, currentNamePairHistory);
} else {
currentNamePairHistory = historyPair.get(name);
}
Matcher m = pairPattern.matcher(text);
if (m.find()) {
String thisKey = m.group(1) + m.group(2);
String otherKey = (m.group(1).equals("b") ? "e" : "b") + m.group(2);
if (currentNamePairHistory.containsKey(otherKey)) {
currentObject = currentNamePairHistory.remove(otherKey);
currentObject.setText(currentObject.getText() + "+" + text);
currentObject.addPosition(position);
processFunctions(name, text, MAPPING_TYPE_CRM_PAIR,
functionOutputList);
currentObject.setRealOffsetEnd(currentOffset + 1);
currentObject.setOffsetEnd(currentOffset + 1);
idPositions.put(currentObject.getId(), currentObject.getPositions());
idOffsets.put(currentObject.getId(), currentObject.getOffset());
currentObject.updateMappings(idPositions, idOffsets);
unknownAncestors.unknown = currentObject.getUnknownAncestorNumber();
computeMappingsFromObject(mtasTokenIdFactory, currentObject,
currentList, updateList);
} else {
currentObject = new MtasParserObject(tmpCurrentType);
currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
currentObject.setRealOffsetStart(currentOffset);
currentObject.setOffsetStart(currentOffset);
currentObject.setText(text);
currentObject.addPosition(position);
if (!prevalidateObject(currentObject, currentList)) {
unknownAncestors.unknown++;
} else {
currentNamePairHistory.put(thisKey, currentObject);
processFunctions(name, text, MAPPING_TYPE_CRM_PAIR,
functionOutputList);
currentObject.setRealOffsetEnd(currentOffset + 1);
currentObject.setOffsetEnd(currentOffset + 1);
idPositions.put(currentObject.getId(),
currentObject.getPositions());
idOffsets.put(currentObject.getId(), currentObject.getOffset());
// offset always null, so update later with word (should be
// possible)
if ((currentObject.getId() != null)
&& (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
currentList.get(MAPPING_TYPE_WORD)
.get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
.addUpdateableIdWithOffset(currentObject.getId());
}
}
}
}
}
}
/**
* Process functions.
*
* @param name the name
* @param text the text
* @param type the type
* @param functionOutputList the function output list
*/
private void processFunctions(String name, String text, String type,
List<MtasCRMParserFunctionOutput> functionOutputList) {
if (functions.containsKey(type) && functions.get(type).containsKey(name)
&& text != null) {
MtasCRMParserFunction function = functions.get(type).get(name);
String[] value;
if (function.split != null) {
value = text.split(Pattern.quote(function.split));
} else {
value = new String[] { text };
}
for (int c = 0; c < value.length; c++) {
boolean checkedEmpty = false;
if (value[c].equals("")) {
checkedEmpty = true;
}
if (function.output.containsKey(value[c])) {
ArrayList<MtasCRMParserFunctionOutput> list = function.output
.get(value[c]);
for (MtasCRMParserFunctionOutput listItem : list) {
functionOutputList.add(listItem.create(value[c]));
}
}
if (!checkedEmpty && function.output.containsKey("")) {
ArrayList<MtasCRMParserFunctionOutput> list = function.output.get("");
for (MtasCRMParserFunctionOutput listItem : list) {
functionOutputList.add(listItem.create(value[c]));
}
}
}
}
}
/*
* (non-Javadoc)
*
* @see mtas.analysis.parser.MtasParser#printConfig()
*/
@Override
public String printConfig() {
StringBuilder text = new StringBuilder();
text.append("=== CONFIGURATION ===\n");
text.append("type: " + wordAnnotationTypes.size() + " x wordAnnotation");
text.append(printConfigTypes(wordAnnotationTypes));
text.append("=== CONFIGURATION ===\n");
return text.toString();
}
/**
* Prints the config types.
*
* @param types the types
* @return the string
*/
private String printConfigTypes(
HashMap<?, MtasParserType<MtasParserMapping<?>>> types) {
StringBuilder text = new StringBuilder();
for (Entry<?, MtasParserType<MtasParserMapping<?>>> entry : types
.entrySet()) {
text.append("- " + entry.getKey() + ": " + entry.getValue().items.size()
+ " mapping(s)\n");
for (int i = 0; i < entry.getValue().items.size(); i++) {
text.append("\t" + entry.getValue().items.get(i) + "\n");
}
}
return text.toString();
}
/**
* The Class MtasCRMAncestors.
*/
private static class MtasCRMAncestors {
/** The unknown. */
public int unknown = 0;
}
/**
* The Class MtasCRMParserFunction.
*/
private static class MtasCRMParserFunction {
/** The split. */
public String split;
/** The output. */
public Map<String, ArrayList<MtasCRMParserFunctionOutput>> output;
/**
* Instantiates a new mtas CRM parser function.
*
* @param type the type
* @param split the split
*/
public MtasCRMParserFunction(String type, String split) {
this.split = split;
output = new HashMap<>();
}
}
/**
* The Class MtasCRMParserFunctionOutput.
*/
private class MtasCRMParserFunctionOutput {
/** The name. */
public String name;
/** The value. */
public String value;
/**
* Instantiates a new mtas CRM parser function output.
*
* @param name the name
* @param value the value
*/
public MtasCRMParserFunctionOutput(String name, String value) {
this.name = name;
this.value = value;
}
/**
* Creates the.
*
* @param originalValue the original value
* @return the mtas CRM parser function output
*/
public MtasCRMParserFunctionOutput create(String originalValue) {
if (value != null) {
return this;
} else {
return new MtasCRMParserFunctionOutput(name, originalValue);
}
}
/*
* (non-Javadoc)
*
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return "MtasCRMParserFunctionOutput[" + name + "," + value + "]";
}
}
/**
* The Class MtasCRMParserMappingWordAnnotation.
*/
private class MtasCRMParserMappingWordAnnotation
extends MtasParserMapping<MtasCRMParserMappingWordAnnotation> {
/**
* Instantiates a new mtas CRM parser mapping word annotation.
*/
public MtasCRMParserMappingWordAnnotation() {
super();
this.position = SOURCE_OWN;
this.realOffset = SOURCE_OWN;
this.offset = SOURCE_ANCESTOR_WORD;
this.type = MAPPING_TYPE_WORD_ANNOTATION;
}
/*
* (non-Javadoc)
*
* @see mtas.analysis.parser.MtasParser.MtasParserMapping#self()
*/
@Override
protected MtasCRMParserMappingWordAnnotation self() {
return this;
}
}
/**
* The Class MtasCRMParserMappingCRMSentence.
*/
private class MtasCRMParserMappingCRMSentence
extends MtasParserMapping<MtasCRMParserMappingCRMSentence> {
/**
* Instantiates a new mtas CRM parser mapping CRM sentence.
*/
public MtasCRMParserMappingCRMSentence() {
super();
this.position = SOURCE_OWN;
this.realOffset = SOURCE_OWN;
this.offset = SOURCE_OWN;
this.type = MAPPING_TYPE_GROUP;
}
/*
* (non-Javadoc)
*
* @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
*/
@Override
protected MtasCRMParserMappingCRMSentence self() {
return this;
}
}
/**
* The Class MtasCRMParserMappingCRMPair.
*/
private class MtasCRMParserMappingCRMPair
extends MtasParserMapping<MtasCRMParserMappingCRMPair> {
/**
* Instantiates a new mtas CRM parser mapping CRM pair.
*/
public MtasCRMParserMappingCRMPair() {
super();
this.position = SOURCE_OWN;
this.realOffset = SOURCE_OWN;
this.offset = SOURCE_OWN;
this.type = MAPPING_TYPE_RELATION;
}
/*
* (non-Javadoc)
*
* @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
*/
@Override
protected MtasCRMParserMappingCRMPair self() {
return this;
}
}
}