MtasCRMParser.java
- package mtas.analysis.parser;
- import java.io.IOException;
- import java.io.Reader;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collection;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Map;
- import java.util.Map.Entry;
- import java.util.Set;
- import java.util.concurrent.atomic.AtomicInteger;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import mtas.analysis.token.MtasTokenCollection;
- import mtas.analysis.token.MtasTokenIdFactory;
- import mtas.analysis.util.MtasBufferedReader;
- import mtas.analysis.util.MtasConfigException;
- import mtas.analysis.util.MtasConfiguration;
- import mtas.analysis.util.MtasParserException;
- /**
- * The Class MtasCRMParser.
- */
- public class MtasCRMParser extends MtasBasicParser {
- /** The Constant log. */
- private static final Log log = LogFactory.getLog(MtasCRMParser.class);
- /** The word type. */
- private MtasParserType<MtasParserMapping<?>> wordType = null;
- /** The word annotation types. */
- private HashMap<String, MtasParserType<MtasParserMapping<?>>> wordAnnotationTypes = new HashMap<>();
- /** The crm sentence types. */
- private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmSentenceTypes = new HashMap<>();
- /** The crm clause types. */
- private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmClauseTypes = new HashMap<>();
- /** The crm pair types. */
- private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmPairTypes = new HashMap<>();
- /** The functions. */
- private HashMap<String, HashMap<String, MtasCRMParserFunction>> functions = new HashMap<>();
- private HashMap<Integer, HashMap<String, String>> filterReplace = new HashMap<>();
- /** The Constant MAPPING_TYPE_CRM_SENTENCE. */
- protected static final String MAPPING_TYPE_CRM_SENTENCE = "crmSentence";
- /** The Constant MAPPING_TYPE_CRM_CLAUSE. */
- protected static final String MAPPING_TYPE_CRM_CLAUSE = "crmClause";
- /** The Constant MAPPING_TYPE_CRM_PAIR. */
- protected static final String MAPPING_TYPE_CRM_PAIR = "crmPair";
- protected static final String FILTER_TYPE_REPLACE = "replace";
-
- /** The history pair. */
- private HashMap<String, HashMap<String, MtasParserObject>> historyPair = new HashMap<>();
- /** The pair pattern. */
- Pattern pairPattern = Pattern.compile("^([b|e])([a-z])([0-9]+)$");
/**
 * Instantiates a new mtas CRM parser and initialises it from the given
 * configuration.
 *
 * <p>A {@link MtasConfigException} raised during initialisation does not
 * propagate to the caller; it is logged as an error and construction
 * completes.
 *
 * @param config the config
 */
public MtasCRMParser(MtasConfiguration config) {
  super(config);
  try {
    initParser();
  } catch (MtasConfigException e) {
    // pass the exception as throwable (not as message) so that the stack
    // trace ends up in the log as well
    log.error("could not initialise MtasCRMParser from configuration", e);
  }
}
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasParser#initParser()
- */
- @SuppressWarnings("unchecked")
- @Override
- protected void initParser() throws MtasConfigException {
- super.initParser();
- if (config != null) {
- // always word, no mappings
- wordType = new MtasParserType<>(MAPPING_TYPE_WORD, null, false);
- for (int i = 0; i < config.children.size(); i++) {
- MtasConfiguration current = config.children.get(i);
- if (current.name.equals("filters")) {
- for (int j = 0; j < current.children.size(); j++) {
- if (current.children.get(j).name.equals("filter")) {
- MtasConfiguration filter = current.children.get(j);
- String typeFilter = filter.attributes.get("type");
- String nameFilter = filter.attributes.get("name");
- if(typeFilter!=null) {
- if(typeFilter.equals(FILTER_TYPE_REPLACE)) {
- String value = filter.attributes.get("value");
- String replace = filter.attributes.get("replace");
- if(nameFilter!=null && value!=null && replace!=null) {
- String[] names = nameFilter.split(Pattern.quote(","));
- for(String name : names) {
- try {
- int nameInt = Integer.parseInt(name);
- HashMap<String, String> nameMap;
- if(!filterReplace.containsKey(nameInt)) {
- nameMap = new HashMap<>();
- filterReplace.put(nameInt, nameMap);
- } else {
- nameMap = filterReplace.get(nameInt);
- }
- nameMap.put(value, replace);
- } catch (NumberFormatException e) {
- log.info(e);
- }
- }
- } else {
- throw new MtasConfigException("no name, value or replace for filter "
- + typeFilter );
- }
- } else {
- throw new MtasConfigException("unknown filter type "
- + typeFilter );
- }
- } else {
- throw new MtasConfigException("no type provided for filter" );
- }
- }
- }
- } else if (current.name.equals("mappings")) {
- for (int j = 0; j < current.children.size(); j++) {
- if (current.children.get(j).name.equals("mapping")) {
- MtasConfiguration mapping = current.children.get(j);
- String typeMapping = mapping.attributes.get("type");
- String nameMapping = mapping.attributes.get("name");
- if ((typeMapping != null)) {
- if (typeMapping.equals(MAPPING_TYPE_WORD)) {
- MtasCRMParserMappingWordAnnotation m = new MtasCRMParserMappingWordAnnotation();
- m.processConfig(mapping);
- wordType.addItem(m);
- } else if (typeMapping.equals(MAPPING_TYPE_WORD_ANNOTATION)
- && (nameMapping != null)) {
- MtasCRMParserMappingWordAnnotation m = new MtasCRMParserMappingWordAnnotation();
- m.processConfig(mapping);
- if (wordAnnotationTypes.containsKey(nameMapping)) {
- wordAnnotationTypes.get(nameMapping).addItem(m);
- } else {
- MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
- typeMapping, nameMapping, false);
- t.addItem(m);
- wordAnnotationTypes.put(nameMapping, t);
- }
- } else if (typeMapping.equals(MAPPING_TYPE_CRM_SENTENCE)) {
- MtasCRMParserMappingCRMSentence m = new MtasCRMParserMappingCRMSentence();
- m.processConfig(mapping);
- if (crmSentenceTypes.containsKey(nameMapping)) {
- crmSentenceTypes.get(nameMapping).addItem(m);
- } else {
- MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
- MAPPING_TYPE_GROUP, nameMapping, true);
- t.addItem(m);
- crmSentenceTypes.put(nameMapping, t);
- }
- } else if (typeMapping.equals(MAPPING_TYPE_CRM_CLAUSE)) {
- MtasCRMParserMappingCRMSentence m = new MtasCRMParserMappingCRMSentence();
- m.processConfig(mapping);
- if (crmClauseTypes.containsKey(nameMapping)) {
- crmClauseTypes.get(nameMapping).addItem(m);
- } else {
- MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
- MAPPING_TYPE_GROUP, nameMapping, true);
- t.addItem(m);
- crmClauseTypes.put(nameMapping, t);
- }
- } else if (typeMapping.equals(MAPPING_TYPE_CRM_PAIR)) {
- MtasCRMParserMappingCRMPair m = new MtasCRMParserMappingCRMPair();
- m.processConfig(mapping);
- if (crmPairTypes.containsKey(nameMapping)) {
- crmPairTypes.get(nameMapping).addItem(m);
- } else {
- MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
- MAPPING_TYPE_RELATION, nameMapping, true);
- t.addItem(m);
- crmPairTypes.put(nameMapping, t);
- }
- } else {
- throw new MtasConfigException("unknown mapping type "
- + typeMapping + " or missing name");
- }
- }
- }
- }
- } else if (current.name.equals("functions")) {
- for (int j = 0; j < current.children.size(); j++) {
- if (current.children.get(j).name.equals("function")) {
- MtasConfiguration function = current.children.get(j);
- String nameFunction = function.attributes.get("name");
- String typeFunction = function.attributes.get("type");
- String splitFunction = function.attributes.get("split");
- if (nameFunction != null && typeFunction != null) {
- MtasCRMParserFunction mtasCRMParserFunction = new MtasCRMParserFunction(
- typeFunction, splitFunction);
- if (!functions.containsKey(typeFunction)) {
- functions.put(typeFunction,
- new HashMap<String, MtasCRMParserFunction>());
- }
- functions.get(typeFunction).put(nameFunction,
- mtasCRMParserFunction);
- MtasConfiguration subCurrent = current.children.get(j);
- for (int k = 0; k < subCurrent.children.size(); k++) {
- if (subCurrent.children.get(k).name.equals("condition")) {
- MtasConfiguration subSubCurrent = subCurrent.children
- .get(k);
- if (subSubCurrent.attributes.containsKey("value")) {
- String[] valuesCondition = subSubCurrent.attributes
- .get("value").split(Pattern.quote(","));
- ArrayList<MtasCRMParserFunctionOutput> valueOutputList = new ArrayList<>();
- for (int l = 0; l < subSubCurrent.children.size(); l++) {
- if (subSubCurrent.children.get(l).name
- .equals("output")) {
- String valueOutput = subSubCurrent.children
- .get(l).attributes.get("value");
- String nameOutput = subSubCurrent.children
- .get(l).attributes.get("name");
- if (nameOutput != null) {
- MtasCRMParserFunctionOutput o = new MtasCRMParserFunctionOutput(
- nameOutput, valueOutput);
- valueOutputList.add(o);
- }
- }
- }
- if (!valueOutputList.isEmpty()) {
- for (String valueCondition : valuesCondition) {
- if (mtasCRMParserFunction.output
- .containsKey(valueCondition)) {
- mtasCRMParserFunction.output.get(valueCondition)
- .addAll(
- (Collection<? extends MtasCRMParserFunctionOutput>) valueOutputList
- .clone());
- } else {
- mtasCRMParserFunction.output.put(valueCondition,
- (ArrayList<MtasCRMParserFunctionOutput>) valueOutputList
- .clone());
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
- */
- @Override
- public MtasTokenCollection createTokenCollection(Reader reader)
- throws MtasParserException, MtasConfigException {
- AtomicInteger position = new AtomicInteger(0);
- MtasCRMAncestors unknownAncestors = new MtasCRMAncestors();
- Map<String, Set<Integer>> idPositions = new HashMap<>();
- Map<String, Integer[]> idOffsets = new HashMap<>();
- Map<String, Map<Integer, Set<String>>> updateList = createUpdateList();
- Map<String, List<MtasParserObject>> currentList = createCurrentList();
- tokenCollection = new MtasTokenCollection();
- MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
- try (MtasBufferedReader br = new MtasBufferedReader(reader)) {
- String line;
- int currentOffset;
- int previousOffset = br.getPosition();
- MtasParserObject currentObject;
- Pattern headerPattern = Pattern.compile("^@ @ @(.*)$");
- Pattern regularPattern = Pattern.compile(
- "^([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+)$");
- Matcher matcherHeader;
- Matcher matcherRegular = null;
- Set<MtasParserObject> newPreviousSentence = new HashSet<>();
- Set<MtasParserObject> previousSentence = new HashSet<>();
- Set<MtasParserObject> newPreviousClause = new HashSet<>();
- Set<MtasParserObject> previousClause = new HashSet<>();
- String[] matcherList = new String[8];
- while ((line = br.readLine()) != null) {
- currentOffset = br.getPosition();
- matcherHeader = headerPattern.matcher(line.trim());
- matcherRegular = regularPattern.matcher(line.trim());
- if (matcherRegular.matches()) {
- newPreviousSentence.clear();
- matcherList = createMatcherList(matcherRegular);
- for (int i = 4; i < 8; i++) {
- List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
- Set<MtasParserObject> tmpList = processCRMSentence(
- mtasTokenIdFactory, String.valueOf(i),
- matcherList[i], currentOffset,
- functionOutputList, unknownAncestors, currentList, updateList,
- idPositions, idOffsets, previousSentence, previousClause);
- if (tmpList != null) {
- newPreviousSentence.addAll(tmpList);
- }
- for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
- tmpList = processCRMSentence(mtasTokenIdFactory,
- functionOutput.name, functionOutput.value, currentOffset,
- functionOutputList, unknownAncestors, currentList, updateList,
- idPositions, idOffsets, previousSentence, previousClause);
- if (tmpList != null) {
- newPreviousSentence.addAll(tmpList);
- }
- }
- }
- if (!newPreviousSentence.isEmpty()) {
- previousSentence.clear();
- previousSentence.addAll(newPreviousSentence);
- }
- newPreviousClause.clear();
- for (int i = 4; i < 8; i++) {
- ArrayList<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
- Set<MtasParserObject> tmpList = processCRMClause(mtasTokenIdFactory,
- String.valueOf(i), matcherList[i], currentOffset,
- functionOutputList, unknownAncestors, currentList, updateList,
- idPositions, idOffsets, previousClause);
- if (tmpList != null) {
- newPreviousClause.addAll(tmpList);
- }
- for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
- tmpList = processCRMClause(mtasTokenIdFactory,
- functionOutput.name, functionOutput.value, currentOffset,
- functionOutputList, unknownAncestors, currentList, updateList,
- idPositions, idOffsets, previousClause);
- if (tmpList != null) {
- newPreviousClause.addAll(tmpList);
- }
- }
- }
- if (!newPreviousClause.isEmpty()) {
- previousClause.clear();
- previousClause.addAll(newPreviousClause);
- }
- }
- if (matcherRegular.matches() && !matcherHeader.matches()) {
- matcherRegular = regularPattern.matcher(line.trim());
- if (matcherRegular.matches()) {
- // regular line - start word
- currentObject = new MtasParserObject(wordType);
- currentObject.setOffsetStart(previousOffset);
- currentObject.setRealOffsetStart(previousOffset);
- currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors.unknown++;
- } else {
- int p = position.getAndIncrement();
- currentObject.addPosition(p);
- currentObject.objectId = "word_" + p;
- currentList.get(MAPPING_TYPE_WORD).add(currentObject);
- unknownAncestors.unknown = 0;
- // check for crmPair
- for (int i = 0; i < 8; i++) {
- List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
- processCRMPair(mtasTokenIdFactory, p, String.valueOf(i),
- matcherList[i], currentOffset,
- functionOutputList, unknownAncestors, currentList,
- updateList, idPositions, idOffsets);
- for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
- processCRMPair(mtasTokenIdFactory, p, functionOutput.name,
- functionOutput.value, currentOffset, functionOutputList,
- unknownAncestors, currentList, updateList, idPositions,
- idOffsets);
- }
- }
- // compute word annotations
- for (int i = 0; i < 8; i++) {
- ArrayList<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
- functionOutputList
- .addAll(processWordAnnotation(mtasTokenIdFactory,
- String.valueOf(i), matcherList[i],
- previousOffset, currentOffset, unknownAncestors,
- currentList, updateList, idPositions, idOffsets));
- for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
- processWordAnnotation(mtasTokenIdFactory, functionOutput.name,
- functionOutput.value, previousOffset, currentOffset,
- unknownAncestors, currentList, updateList, idPositions,
- idOffsets);
- }
- }
- }
- // finish word
- if (unknownAncestors.unknown > 0) {
- unknownAncestors.unknown--;
- } else {
- currentObject = currentList.get(MAPPING_TYPE_WORD)
- .remove(currentList.get(MAPPING_TYPE_WORD).size() - 1);
- assert unknownAncestors.unknown == 0 : "error in administration "
- + currentObject.getType().getName();
- currentObject.setText(null);
- currentObject.setOffsetEnd(currentOffset - 1);
- currentObject.setRealOffsetEnd(currentOffset - 1);
- // update ancestor groups with position and offset
- for (MtasParserObject currentGroup : currentList
- .get(MAPPING_TYPE_GROUP)) {
- currentGroup.addPositions(currentObject.getPositions());
- currentGroup.addOffsetStart(currentObject.getOffsetStart());
- currentGroup.addOffsetEnd(currentObject.getOffsetEnd());
- }
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(), currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors.unknown = currentObject
- .getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- }
- } else {
- // System.out.println("PROBLEM: " + line);
- }
- }
- previousOffset = br.getPosition();
- }
- closePrevious(mtasTokenIdFactory, previousSentence, previousOffset,
- unknownAncestors, currentList, updateList, idPositions, idOffsets);
- closePrevious(mtasTokenIdFactory, previousClause, previousOffset,
- unknownAncestors, currentList, updateList, idPositions, idOffsets);
- } catch (IOException e) {
- log.debug(e);
- throw new MtasParserException(e.getMessage());
- }
- // final check
- tokenCollection.check(autorepair, makeunique);
- return tokenCollection;
- }
- private String[] createMatcherList(Matcher matcher) {
- String[] list = new String[8];
- String value;
- for(int i=0; i<8; i++) {
- value = matcher.group((i+1));
- if(filterReplace.containsKey(i)) {
- for(Entry<String,String> entry : filterReplace.get(i).entrySet()) {
- value = value.replaceAll(Pattern.quote(entry.getKey()), entry.getValue());
- }
- }
- list[i] = value;
- }
- return list;
- }
-
- /**
- * Process word annotation.
- *
- * @param mtasTokenIdFactory the mtas token id factory
- * @param name the name
- * @param text the text
- * @param previousOffset the previous offset
- * @param currentOffset the current offset
- * @param unknownAncestors the unknown ancestors
- * @param currentList the current list
- * @param updateList the update list
- * @param idPositions the id positions
- * @param idOffsets the id offsets
- * @return the list
- * @throws MtasParserException the mtas parser exception
- * @throws MtasConfigException the mtas config exception
- */
- private List<MtasCRMParserFunctionOutput> processWordAnnotation(
- MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
- Integer previousOffset, Integer currentOffset,
- MtasCRMAncestors unknownAncestors,
- Map<String, List<MtasParserObject>> currentList,
- Map<String, Map<Integer, Set<String>>> updateList,
- Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
- throws MtasParserException, MtasConfigException {
- MtasParserType tmpCurrentType;
- MtasParserObject currentObject;
- List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
- if ((tmpCurrentType = wordAnnotationTypes.get(name)) != null) {
- // start word annotation
- currentObject = new MtasParserObject(tmpCurrentType);
- currentObject.setRealOffsetStart(previousOffset);
- currentObject.addPositions(currentList.get(MAPPING_TYPE_WORD)
- .get((currentList.get(MAPPING_TYPE_WORD).size() - 1)).getPositions());
- currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors.unknown++;
- } else {
- currentList.get(MAPPING_TYPE_WORD_ANNOTATION).add(currentObject);
- unknownAncestors.unknown = 0;
- }
- // finish word annotation
- if (unknownAncestors.unknown > 0) {
- unknownAncestors.unknown--;
- } else {
- currentObject = currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
- .remove(currentList.get(MAPPING_TYPE_WORD_ANNOTATION).size() - 1);
- assert unknownAncestors.unknown == 0 : "error in administration "
- + currentObject.getType().getName();
- if (functions.containsKey(MAPPING_TYPE_WORD_ANNOTATION)
- && functions.get(MAPPING_TYPE_WORD_ANNOTATION).containsKey(name)
- && text != null) {
- MtasCRMParserFunction function = functions
- .get(MAPPING_TYPE_WORD_ANNOTATION).get(name);
- String[] value;
- if (function.split != null) {
- value = text.split(Pattern.quote(function.split));
- } else {
- value = new String[] { text };
- }
- for (int c = 0; c < value.length; c++) {
- if (function.output.containsKey(value[c])) {
- functionOutputList.addAll(function.output.get(value[c]));
- }
- }
- }
- currentObject.setText(text);
- currentObject.setRealOffsetEnd(currentOffset - 1);
- idPositions.put(currentObject.getId(), currentObject.getPositions());
- idOffsets.put(currentObject.getId(), currentObject.getOffset());
- // offset always null, so update later with word (should be possible)
- if ((currentObject.getId() != null)
- && (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
- currentList.get(MAPPING_TYPE_WORD)
- .get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
- .addUpdateableIdWithOffset(currentObject.getId());
- }
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors.unknown = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- }
- }
- return functionOutputList;
- }
- /**
- * Process CRM sentence.
- *
- * @param mtasTokenIdFactory the mtas token id factory
- * @param name the name
- * @param text the text
- * @param currentOffset the current offset
- * @param functionOutputList the function output list
- * @param unknownAncestors the unknown ancestors
- * @param currentList the current list
- * @param updateList the update list
- * @param idPositions the id positions
- * @param idOffsets the id offsets
- * @param previous the previous
- * @param previousClause the previous clause
- * @return the sets the
- * @throws MtasParserException the mtas parser exception
- * @throws MtasConfigException the mtas config exception
- */
- private Set<MtasParserObject> processCRMSentence(
- MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
- Integer currentOffset,
- List<MtasCRMParserFunctionOutput> functionOutputList,
- MtasCRMAncestors unknownAncestors,
- Map<String, List<MtasParserObject>> currentList,
- Map<String, Map<Integer, Set<String>>> updateList,
- Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets,
- Set<MtasParserObject> previous, Set<MtasParserObject> previousClause)
- throws MtasParserException, MtasConfigException {
- MtasParserType tmpCurrentType;
- MtasParserObject currentObject;
- if ((tmpCurrentType = crmSentenceTypes.get(name)) != null) {
- String filteredText = text.replaceAll("[^0-9\\-]", "");
- currentObject = new MtasParserObject(tmpCurrentType);
- currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
- currentObject.setRealOffsetStart(currentOffset);
- currentObject.setText(filteredText);
- if (!prevalidateObject(currentObject, currentList)) {
- return new HashSet<>();
- } else {
- closePrevious(mtasTokenIdFactory, previousClause, currentOffset,
- unknownAncestors, currentList, updateList, idPositions, idOffsets);
- closePrevious(mtasTokenIdFactory, previous, currentOffset,
- unknownAncestors, currentList, updateList, idPositions, idOffsets);
- previous.clear();
- currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
- unknownAncestors.unknown = 0;
- return new HashSet<>(Arrays.asList(currentObject));
- }
- }
- return new HashSet<>();
- }
- /**
- * Process CRM clause.
- *
- * @param mtasTokenIdFactory the mtas token id factory
- * @param name the name
- * @param text the text
- * @param currentOffset the current offset
- * @param functionOutputList the function output list
- * @param unknownAncestors the unknown ancestors
- * @param currentList the current list
- * @param updateList the update list
- * @param idPositions the id positions
- * @param idOffsets the id offsets
- * @param previous the previous
- * @return the sets the
- * @throws MtasParserException the mtas parser exception
- * @throws MtasConfigException the mtas config exception
- */
- private Set<MtasParserObject> processCRMClause(
- MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
- Integer currentOffset,
- List<MtasCRMParserFunctionOutput> functionOutputList,
- MtasCRMAncestors unknownAncestors,
- Map<String, List<MtasParserObject>> currentList,
- Map<String, Map<Integer, Set<String>>> updateList,
- Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets,
- Set<MtasParserObject> previous)
- throws MtasParserException, MtasConfigException {
- MtasParserType tmpCurrentType;
- MtasParserObject currentObject;
- if ((tmpCurrentType = crmClauseTypes.get(name)) != null) {
- String filteredText = text.replaceAll("[^0-9\\-]", "");
- currentObject = new MtasParserObject(tmpCurrentType);
- currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
- currentObject.setRealOffsetStart(currentOffset);
- currentObject.setText(filteredText);
- if (!prevalidateObject(currentObject, currentList)) {
- return new HashSet<>();
- } else {
- closePrevious(mtasTokenIdFactory, previous, currentOffset,
- unknownAncestors, currentList, updateList, idPositions, idOffsets);
- previous.clear();
- currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
- unknownAncestors.unknown = 0;
- return new HashSet<>(Arrays.asList(currentObject));
- }
- }
- return new HashSet<>();
- }
- /**
- * Close previous.
- *
- * @param mtasTokenIdFactory the mtas token id factory
- * @param previous the previous
- * @param currentOffset the current offset
- * @param unknownAncestors the unknown ancestors
- * @param currentList the current list
- * @param updateList the update list
- * @param idPositions the id positions
- * @param idOffsets the id offsets
- * @throws MtasParserException the mtas parser exception
- * @throws MtasConfigException the mtas config exception
- */
- private void closePrevious(MtasTokenIdFactory mtasTokenIdFactory,
- Set<MtasParserObject> previous, Integer currentOffset,
- MtasCRMAncestors unknownAncestors,
- Map<String, List<MtasParserObject>> currentList,
- Map<String, Map<Integer, Set<String>>> updateList,
- Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
- throws MtasParserException, MtasConfigException {
- for (MtasParserObject previousObject : previous) {
- previousObject.setRealOffsetEnd(currentOffset);
- idPositions.put(previousObject.getId(), previousObject.getPositions());
- idOffsets.put(previousObject.getId(), previousObject.getOffset());
- previousObject.updateMappings(idPositions, idOffsets);
- unknownAncestors.unknown = previousObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, previousObject, currentList,
- updateList);
- currentList.get(MAPPING_TYPE_GROUP).remove(previousObject);
- }
- }
- /**
- * Process CRM pair.
- *
- * @param mtasTokenIdFactory the mtas token id factory
- * @param position the position
- * @param name the name
- * @param text the text
- * @param currentOffset the current offset
- * @param functionOutputList the function output list
- * @param unknownAncestors the unknown ancestors
- * @param currentList the current list
- * @param updateList the update list
- * @param idPositions the id positions
- * @param idOffsets the id offsets
- * @throws MtasParserException the mtas parser exception
- * @throws MtasConfigException the mtas config exception
- */
- private void processCRMPair(MtasTokenIdFactory mtasTokenIdFactory,
- int position, String name, String text, Integer currentOffset,
- List<MtasCRMParserFunctionOutput> functionOutputList,
- MtasCRMAncestors unknownAncestors,
- Map<String, List<MtasParserObject>> currentList,
- Map<String, Map<Integer, Set<String>>> updateList,
- Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
- throws MtasParserException, MtasConfigException {
- MtasParserType tmpCurrentType;
- MtasParserObject currentObject;
- if ((tmpCurrentType = crmPairTypes.get(name)) != null) {
- // get history
- HashMap<String, MtasParserObject> currentNamePairHistory;
- if (!historyPair.containsKey(name)) {
- currentNamePairHistory = new HashMap<>();
- historyPair.put(name, currentNamePairHistory);
- } else {
- currentNamePairHistory = historyPair.get(name);
- }
- Matcher m = pairPattern.matcher(text);
- if (m.find()) {
- String thisKey = m.group(1) + m.group(2);
- String otherKey = (m.group(1).equals("b") ? "e" : "b") + m.group(2);
- if (currentNamePairHistory.containsKey(otherKey)) {
- currentObject = currentNamePairHistory.remove(otherKey);
- currentObject.setText(currentObject.getText() + "+" + text);
- currentObject.addPosition(position);
- processFunctions(name, text, MAPPING_TYPE_CRM_PAIR,
- functionOutputList);
- currentObject.setRealOffsetEnd(currentOffset + 1);
- currentObject.setOffsetEnd(currentOffset + 1);
- idPositions.put(currentObject.getId(), currentObject.getPositions());
- idOffsets.put(currentObject.getId(), currentObject.getOffset());
- currentObject.updateMappings(idPositions, idOffsets);
- unknownAncestors.unknown = currentObject.getUnknownAncestorNumber();
- computeMappingsFromObject(mtasTokenIdFactory, currentObject,
- currentList, updateList);
- } else {
- currentObject = new MtasParserObject(tmpCurrentType);
- currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
- currentObject.setRealOffsetStart(currentOffset);
- currentObject.setOffsetStart(currentOffset);
- currentObject.setText(text);
- currentObject.addPosition(position);
- if (!prevalidateObject(currentObject, currentList)) {
- unknownAncestors.unknown++;
- } else {
- currentNamePairHistory.put(thisKey, currentObject);
- processFunctions(name, text, MAPPING_TYPE_CRM_PAIR,
- functionOutputList);
- currentObject.setRealOffsetEnd(currentOffset + 1);
- currentObject.setOffsetEnd(currentOffset + 1);
- idPositions.put(currentObject.getId(),
- currentObject.getPositions());
- idOffsets.put(currentObject.getId(), currentObject.getOffset());
- // offset always null, so update later with word (should be
- // possible)
- if ((currentObject.getId() != null)
- && (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
- currentList.get(MAPPING_TYPE_WORD)
- .get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
- .addUpdateableIdWithOffset(currentObject.getId());
- }
- }
- }
- }
- }
- }
- /**
- * Process functions.
- *
- * @param name the name
- * @param text the text
- * @param type the type
- * @param functionOutputList the function output list
- */
- private void processFunctions(String name, String text, String type,
- List<MtasCRMParserFunctionOutput> functionOutputList) {
- if (functions.containsKey(type) && functions.get(type).containsKey(name)
- && text != null) {
- MtasCRMParserFunction function = functions.get(type).get(name);
- String[] value;
- if (function.split != null) {
- value = text.split(Pattern.quote(function.split));
- } else {
- value = new String[] { text };
- }
- for (int c = 0; c < value.length; c++) {
- boolean checkedEmpty = false;
- if (value[c].equals("")) {
- checkedEmpty = true;
- }
- if (function.output.containsKey(value[c])) {
- ArrayList<MtasCRMParserFunctionOutput> list = function.output
- .get(value[c]);
- for (MtasCRMParserFunctionOutput listItem : list) {
- functionOutputList.add(listItem.create(value[c]));
- }
- }
- if (!checkedEmpty && function.output.containsKey("")) {
- ArrayList<MtasCRMParserFunctionOutput> list = function.output.get("");
- for (MtasCRMParserFunctionOutput listItem : list) {
- functionOutputList.add(listItem.create(value[c]));
- }
- }
- }
- }
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasParser#printConfig()
- */
- @Override
- public String printConfig() {
- StringBuilder text = new StringBuilder();
- text.append("=== CONFIGURATION ===\n");
- text.append("type: " + wordAnnotationTypes.size() + " x wordAnnotation");
- text.append(printConfigTypes(wordAnnotationTypes));
- text.append("=== CONFIGURATION ===\n");
- return text.toString();
- }
- /**
- * Prints the config types.
- *
- * @param types the types
- * @return the string
- */
- private String printConfigTypes(
- HashMap<?, MtasParserType<MtasParserMapping<?>>> types) {
- StringBuilder text = new StringBuilder();
- for (Entry<?, MtasParserType<MtasParserMapping<?>>> entry : types
- .entrySet()) {
- text.append("- " + entry.getKey() + ": " + entry.getValue().items.size()
- + " mapping(s)\n");
- for (int i = 0; i < entry.getValue().items.size(); i++) {
- text.append("\t" + entry.getValue().items.get(i) + "\n");
- }
- }
- return text.toString();
- }
- /**
- * The Class MtasCRMAncestors.
- */
- private static class MtasCRMAncestors {
- /** The unknown. */
- public int unknown = 0;
- }
- /**
- * The Class MtasCRMParserFunction.
- */
- private static class MtasCRMParserFunction {
- /** The split. */
- public String split;
- /** The output. */
- public Map<String, ArrayList<MtasCRMParserFunctionOutput>> output;
- /**
- * Instantiates a new mtas CRM parser function.
- *
- * @param type the type
- * @param split the split
- */
- public MtasCRMParserFunction(String type, String split) {
- this.split = split;
- output = new HashMap<>();
- }
- }
- /**
- * The Class MtasCRMParserFunctionOutput.
- */
- private class MtasCRMParserFunctionOutput {
- /** The name. */
- public String name;
- /** The value. */
- public String value;
- /**
- * Instantiates a new mtas CRM parser function output.
- *
- * @param name the name
- * @param value the value
- */
- public MtasCRMParserFunctionOutput(String name, String value) {
- this.name = name;
- this.value = value;
- }
- /**
- * Creates the.
- *
- * @param originalValue the original value
- * @return the mtas CRM parser function output
- */
- public MtasCRMParserFunctionOutput create(String originalValue) {
- if (value != null) {
- return this;
- } else {
- return new MtasCRMParserFunctionOutput(name, originalValue);
- }
- }
- /*
- * (non-Javadoc)
- *
- * @see java.lang.Object#toString()
- */
- @Override
- public String toString() {
- return "MtasCRMParserFunctionOutput[" + name + "," + value + "]";
- }
- }
- /**
- * The Class MtasCRMParserMappingWordAnnotation.
- */
- private class MtasCRMParserMappingWordAnnotation
- extends MtasParserMapping<MtasCRMParserMappingWordAnnotation> {
- /**
- * Instantiates a new mtas CRM parser mapping word annotation.
- */
- public MtasCRMParserMappingWordAnnotation() {
- super();
- this.position = SOURCE_OWN;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_ANCESTOR_WORD;
- this.type = MAPPING_TYPE_WORD_ANNOTATION;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasParser.MtasParserMapping#self()
- */
- @Override
- protected MtasCRMParserMappingWordAnnotation self() {
- return this;
- }
- }
- /**
- * The Class MtasCRMParserMappingCRMSentence.
- */
- private class MtasCRMParserMappingCRMSentence
- extends MtasParserMapping<MtasCRMParserMappingCRMSentence> {
- /**
- * Instantiates a new mtas CRM parser mapping CRM sentence.
- */
- public MtasCRMParserMappingCRMSentence() {
- super();
- this.position = SOURCE_OWN;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_OWN;
- this.type = MAPPING_TYPE_GROUP;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasCRMParserMappingCRMSentence self() {
- return this;
- }
- }
- /**
- * The Class MtasCRMParserMappingCRMPair.
- */
- private class MtasCRMParserMappingCRMPair
- extends MtasParserMapping<MtasCRMParserMappingCRMPair> {
- /**
- * Instantiates a new mtas CRM parser mapping CRM pair.
- */
- public MtasCRMParserMappingCRMPair() {
- super();
- this.position = SOURCE_OWN;
- this.realOffset = SOURCE_OWN;
- this.offset = SOURCE_OWN;
- this.type = MAPPING_TYPE_RELATION;
- }
- /*
- * (non-Javadoc)
- *
- * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
- */
- @Override
- protected MtasCRMParserMappingCRMPair self() {
- return this;
- }
- }
- }