MtasCRMParser.java

  1. package mtas.analysis.parser;

  2. import java.io.IOException;
  3. import java.io.Reader;
  4. import java.util.ArrayList;
  5. import java.util.Arrays;
  6. import java.util.Collection;
  7. import java.util.HashMap;
  8. import java.util.HashSet;
  9. import java.util.List;
  10. import java.util.Map;
  11. import java.util.Map.Entry;
  12. import java.util.Set;
  13. import java.util.concurrent.atomic.AtomicInteger;
  14. import java.util.regex.Matcher;
  15. import java.util.regex.Pattern;

  16. import org.apache.commons.logging.Log;
  17. import org.apache.commons.logging.LogFactory;

  18. import mtas.analysis.token.MtasTokenCollection;
  19. import mtas.analysis.token.MtasTokenIdFactory;
  20. import mtas.analysis.util.MtasBufferedReader;
  21. import mtas.analysis.util.MtasConfigException;
  22. import mtas.analysis.util.MtasConfiguration;
  23. import mtas.analysis.util.MtasParserException;

  24. /**
  25.  * The Class MtasCRMParser.
  26.  */

  27. public class MtasCRMParser extends MtasBasicParser {

  28.   /** The Constant log. */
  29.   private static final Log log = LogFactory.getLog(MtasCRMParser.class);

  30.   /** The word type. */
  31.   private MtasParserType<MtasParserMapping<?>> wordType = null;

  32.   /** The word annotation types. */
  33.   private HashMap<String, MtasParserType<MtasParserMapping<?>>> wordAnnotationTypes = new HashMap<>();

  34.   /** The crm sentence types. */
  35.   private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmSentenceTypes = new HashMap<>();

  36.   /** The crm clause types. */
  37.   private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmClauseTypes = new HashMap<>();

  38.   /** The crm pair types. */
  39.   private HashMap<String, MtasParserType<MtasParserMapping<?>>> crmPairTypes = new HashMap<>();

  40.   /** The functions. */
  41.   private HashMap<String, HashMap<String, MtasCRMParserFunction>> functions = new HashMap<>();

  42.   private HashMap<Integer, HashMap<String, String>> filterReplace = new HashMap<>();

  43.   /** The Constant MAPPING_TYPE_CRM_SENTENCE. */
  44.   protected static final String MAPPING_TYPE_CRM_SENTENCE = "crmSentence";

  45.   /** The Constant MAPPING_TYPE_CRM_CLAUSE. */
  46.   protected static final String MAPPING_TYPE_CRM_CLAUSE = "crmClause";

  47.   /** The Constant MAPPING_TYPE_CRM_PAIR. */
  48.   protected static final String MAPPING_TYPE_CRM_PAIR = "crmPair";

  49.   protected static final String FILTER_TYPE_REPLACE = "replace";

  50.  
  51.   /** The history pair. */
  52.   private HashMap<String, HashMap<String, MtasParserObject>> historyPair = new HashMap<>();

  53.   /** The pair pattern. */
  54.   Pattern pairPattern = Pattern.compile("^([b|e])([a-z])([0-9]+)$");

  /**
   * Instantiates a new mtas CRM parser and initialises it from the supplied
   * configuration.
   *
   * <p>
   * A {@link MtasConfigException} raised by {@link #initParser()} is logged
   * and not propagated, so construction itself does not fail on it.
   *
   * @param config the configuration, handed on to the superclass constructor
   */
  public MtasCRMParser(MtasConfiguration config) {
    super(config);
    try {
      initParser();
    } catch (MtasConfigException e) {
      // hand the exception over as throwable, otherwise only its toString()
      // is logged and the stack trace is lost
      log.error("initialisation of CRM parser failed: " + e.getMessage(), e);
    }
  }

  69.   /*
  70.    * (non-Javadoc)
  71.    *
  72.    * @see mtas.analysis.parser.MtasParser#initParser()
  73.    */
  74.   @SuppressWarnings("unchecked")
  75.   @Override
  76.   protected void initParser() throws MtasConfigException {
  77.     super.initParser();
  78.     if (config != null) {
  79.       // always word, no mappings
  80.       wordType = new MtasParserType<>(MAPPING_TYPE_WORD, null, false);
  81.       for (int i = 0; i < config.children.size(); i++) {
  82.         MtasConfiguration current = config.children.get(i);
  83.         if (current.name.equals("filters")) {
  84.           for (int j = 0; j < current.children.size(); j++) {
  85.             if (current.children.get(j).name.equals("filter")) {
  86.               MtasConfiguration filter = current.children.get(j);
  87.               String typeFilter = filter.attributes.get("type");
  88.               String nameFilter = filter.attributes.get("name");
  89.               if(typeFilter!=null) {
  90.                 if(typeFilter.equals(FILTER_TYPE_REPLACE)) {
  91.                   String value = filter.attributes.get("value");
  92.                   String replace = filter.attributes.get("replace");
  93.                   if(nameFilter!=null && value!=null && replace!=null) {
  94.                     String[] names = nameFilter.split(Pattern.quote(","));
  95.                     for(String name : names) {
  96.                       try {
  97.                         int nameInt = Integer.parseInt(name);
  98.                         HashMap<String, String> nameMap;
  99.                         if(!filterReplace.containsKey(nameInt)) {
  100.                           nameMap = new HashMap<>();
  101.                           filterReplace.put(nameInt, nameMap);
  102.                         } else {
  103.                           nameMap = filterReplace.get(nameInt);
  104.                         }
  105.                         nameMap.put(value, replace);
  106.                       } catch (NumberFormatException e) {
  107.                         log.info(e);                        
  108.                       }                      
  109.                     }                    
  110.                   } else {
  111.                     throw new MtasConfigException("no name, value or replace for filter "
  112.                         + typeFilter );
  113.                   }
  114.                 } else {
  115.                   throw new MtasConfigException("unknown filter type "
  116.                       + typeFilter );
  117.                 }
  118.               } else {
  119.                 throw new MtasConfigException("no type provided for filter" );
  120.               }
  121.             }
  122.           }  
  123.         } else if (current.name.equals("mappings")) {
  124.           for (int j = 0; j < current.children.size(); j++) {
  125.             if (current.children.get(j).name.equals("mapping")) {
  126.               MtasConfiguration mapping = current.children.get(j);
  127.               String typeMapping = mapping.attributes.get("type");
  128.               String nameMapping = mapping.attributes.get("name");
  129.               if ((typeMapping != null)) {
  130.                 if (typeMapping.equals(MAPPING_TYPE_WORD)) {
  131.                   MtasCRMParserMappingWordAnnotation m = new MtasCRMParserMappingWordAnnotation();
  132.                   m.processConfig(mapping);
  133.                   wordType.addItem(m);
  134.                 } else if (typeMapping.equals(MAPPING_TYPE_WORD_ANNOTATION)
  135.                     && (nameMapping != null)) {
  136.                   MtasCRMParserMappingWordAnnotation m = new MtasCRMParserMappingWordAnnotation();
  137.                   m.processConfig(mapping);
  138.                   if (wordAnnotationTypes.containsKey(nameMapping)) {
  139.                     wordAnnotationTypes.get(nameMapping).addItem(m);
  140.                   } else {
  141.                     MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
  142.                         typeMapping, nameMapping, false);
  143.                     t.addItem(m);
  144.                     wordAnnotationTypes.put(nameMapping, t);
  145.                   }
  146.                 } else if (typeMapping.equals(MAPPING_TYPE_CRM_SENTENCE)) {
  147.                   MtasCRMParserMappingCRMSentence m = new MtasCRMParserMappingCRMSentence();
  148.                   m.processConfig(mapping);
  149.                   if (crmSentenceTypes.containsKey(nameMapping)) {
  150.                     crmSentenceTypes.get(nameMapping).addItem(m);
  151.                   } else {
  152.                     MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
  153.                         MAPPING_TYPE_GROUP, nameMapping, true);
  154.                     t.addItem(m);
  155.                     crmSentenceTypes.put(nameMapping, t);
  156.                   }
  157.                 } else if (typeMapping.equals(MAPPING_TYPE_CRM_CLAUSE)) {
  158.                   MtasCRMParserMappingCRMSentence m = new MtasCRMParserMappingCRMSentence();
  159.                   m.processConfig(mapping);
  160.                   if (crmClauseTypes.containsKey(nameMapping)) {
  161.                     crmClauseTypes.get(nameMapping).addItem(m);
  162.                   } else {
  163.                     MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
  164.                         MAPPING_TYPE_GROUP, nameMapping, true);
  165.                     t.addItem(m);
  166.                     crmClauseTypes.put(nameMapping, t);
  167.                   }
  168.                 } else if (typeMapping.equals(MAPPING_TYPE_CRM_PAIR)) {
  169.                   MtasCRMParserMappingCRMPair m = new MtasCRMParserMappingCRMPair();
  170.                   m.processConfig(mapping);
  171.                   if (crmPairTypes.containsKey(nameMapping)) {
  172.                     crmPairTypes.get(nameMapping).addItem(m);
  173.                   } else {
  174.                     MtasParserType<MtasParserMapping<?>> t = new MtasParserType<>(
  175.                         MAPPING_TYPE_RELATION, nameMapping, true);
  176.                     t.addItem(m);
  177.                     crmPairTypes.put(nameMapping, t);
  178.                   }
  179.                 } else {
  180.                   throw new MtasConfigException("unknown mapping type "
  181.                       + typeMapping + " or missing name");
  182.                 }
  183.               }
  184.             }
  185.           }
  186.         } else if (current.name.equals("functions")) {
  187.           for (int j = 0; j < current.children.size(); j++) {
  188.             if (current.children.get(j).name.equals("function")) {
  189.               MtasConfiguration function = current.children.get(j);
  190.               String nameFunction = function.attributes.get("name");
  191.               String typeFunction = function.attributes.get("type");
  192.               String splitFunction = function.attributes.get("split");
  193.               if (nameFunction != null && typeFunction != null) {
  194.                 MtasCRMParserFunction mtasCRMParserFunction = new MtasCRMParserFunction(
  195.                     typeFunction, splitFunction);
  196.                 if (!functions.containsKey(typeFunction)) {
  197.                   functions.put(typeFunction,
  198.                       new HashMap<String, MtasCRMParserFunction>());
  199.                 }
  200.                 functions.get(typeFunction).put(nameFunction,
  201.                     mtasCRMParserFunction);
  202.                 MtasConfiguration subCurrent = current.children.get(j);
  203.                 for (int k = 0; k < subCurrent.children.size(); k++) {
  204.                   if (subCurrent.children.get(k).name.equals("condition")) {
  205.                     MtasConfiguration subSubCurrent = subCurrent.children
  206.                         .get(k);
  207.                     if (subSubCurrent.attributes.containsKey("value")) {
  208.                       String[] valuesCondition = subSubCurrent.attributes
  209.                           .get("value").split(Pattern.quote(","));
  210.                       ArrayList<MtasCRMParserFunctionOutput> valueOutputList = new ArrayList<>();
  211.                       for (int l = 0; l < subSubCurrent.children.size(); l++) {
  212.                         if (subSubCurrent.children.get(l).name
  213.                             .equals("output")) {
  214.                           String valueOutput = subSubCurrent.children
  215.                               .get(l).attributes.get("value");
  216.                           String nameOutput = subSubCurrent.children
  217.                               .get(l).attributes.get("name");
  218.                           if (nameOutput != null) {
  219.                             MtasCRMParserFunctionOutput o = new MtasCRMParserFunctionOutput(
  220.                                 nameOutput, valueOutput);
  221.                             valueOutputList.add(o);
  222.                           }
  223.                         }
  224.                       }
  225.                       if (!valueOutputList.isEmpty()) {
  226.                         for (String valueCondition : valuesCondition) {
  227.                           if (mtasCRMParserFunction.output
  228.                               .containsKey(valueCondition)) {
  229.                             mtasCRMParserFunction.output.get(valueCondition)
  230.                                 .addAll(
  231.                                     (Collection<? extends MtasCRMParserFunctionOutput>) valueOutputList
  232.                                         .clone());
  233.                           } else {
  234.                             mtasCRMParserFunction.output.put(valueCondition,
  235.                                 (ArrayList<MtasCRMParserFunctionOutput>) valueOutputList
  236.                                     .clone());
  237.                           }
  238.                         }
  239.                       }
  240.                     }
  241.                   }
  242.                 }
  243.               }
  244.             }
  245.           }
  246.         }
  247.       }
  248.     }
  249.   }

  250.   /*
  251.    * (non-Javadoc)
  252.    *
  253.    * @see mtas.analysis.parser.MtasParser#createTokenCollection(java.io.Reader)
  254.    */
  255.   @Override
  256.   public MtasTokenCollection createTokenCollection(Reader reader)
  257.       throws MtasParserException, MtasConfigException {
  258.     AtomicInteger position = new AtomicInteger(0);
  259.     MtasCRMAncestors unknownAncestors = new MtasCRMAncestors();

  260.     Map<String, Set<Integer>> idPositions = new HashMap<>();
  261.     Map<String, Integer[]> idOffsets = new HashMap<>();

  262.     Map<String, Map<Integer, Set<String>>> updateList = createUpdateList();
  263.     Map<String, List<MtasParserObject>> currentList = createCurrentList();

  264.     tokenCollection = new MtasTokenCollection();
  265.     MtasTokenIdFactory mtasTokenIdFactory = new MtasTokenIdFactory();
  266.     try (MtasBufferedReader br = new MtasBufferedReader(reader)) {
  267.       String line;
  268.       int currentOffset;
  269.       int previousOffset = br.getPosition();
  270.       MtasParserObject currentObject;
  271.       Pattern headerPattern = Pattern.compile("^@ @ @(.*)$");
  272.       Pattern regularPattern = Pattern.compile(
  273.           "^([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+)$");
  274.       Matcher matcherHeader;
  275.       Matcher matcherRegular = null;
  276.       Set<MtasParserObject> newPreviousSentence = new HashSet<>();
  277.       Set<MtasParserObject> previousSentence = new HashSet<>();
  278.       Set<MtasParserObject> newPreviousClause = new HashSet<>();
  279.       Set<MtasParserObject> previousClause = new HashSet<>();
  280.       String[] matcherList = new String[8];
  281.       while ((line = br.readLine()) != null) {
  282.         currentOffset = br.getPosition();
  283.         matcherHeader = headerPattern.matcher(line.trim());
  284.         matcherRegular = regularPattern.matcher(line.trim());
  285.         if (matcherRegular.matches()) {
  286.           newPreviousSentence.clear();
  287.           matcherList = createMatcherList(matcherRegular);
  288.           for (int i = 4; i < 8; i++) {
  289.             List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
  290.             Set<MtasParserObject> tmpList = processCRMSentence(
  291.                 mtasTokenIdFactory, String.valueOf(i),
  292.                 matcherList[i], currentOffset,
  293.                 functionOutputList, unknownAncestors, currentList, updateList,
  294.                 idPositions, idOffsets, previousSentence, previousClause);
  295.             if (tmpList != null) {
  296.               newPreviousSentence.addAll(tmpList);
  297.             }
  298.             for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
  299.               tmpList = processCRMSentence(mtasTokenIdFactory,
  300.                   functionOutput.name, functionOutput.value, currentOffset,
  301.                   functionOutputList, unknownAncestors, currentList, updateList,
  302.                   idPositions, idOffsets, previousSentence, previousClause);
  303.               if (tmpList != null) {
  304.                 newPreviousSentence.addAll(tmpList);
  305.               }
  306.             }
  307.           }
  308.           if (!newPreviousSentence.isEmpty()) {
  309.             previousSentence.clear();
  310.             previousSentence.addAll(newPreviousSentence);
  311.           }
  312.           newPreviousClause.clear();
  313.           for (int i = 4; i < 8; i++) {
  314.             ArrayList<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
  315.             Set<MtasParserObject> tmpList = processCRMClause(mtasTokenIdFactory,
  316.                 String.valueOf(i), matcherList[i], currentOffset,
  317.                 functionOutputList, unknownAncestors, currentList, updateList,
  318.                 idPositions, idOffsets, previousClause);
  319.             if (tmpList != null) {
  320.               newPreviousClause.addAll(tmpList);
  321.             }
  322.             for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
  323.               tmpList = processCRMClause(mtasTokenIdFactory,
  324.                   functionOutput.name, functionOutput.value, currentOffset,
  325.                   functionOutputList, unknownAncestors, currentList, updateList,
  326.                   idPositions, idOffsets, previousClause);
  327.               if (tmpList != null) {
  328.                 newPreviousClause.addAll(tmpList);
  329.               }
  330.             }
  331.           }
  332.           if (!newPreviousClause.isEmpty()) {
  333.             previousClause.clear();
  334.             previousClause.addAll(newPreviousClause);
  335.           }
  336.         }

  337.         if (matcherRegular.matches() && !matcherHeader.matches()) {
  338.           matcherRegular = regularPattern.matcher(line.trim());
  339.           if (matcherRegular.matches()) {
  340.             // regular line - start word
  341.             currentObject = new MtasParserObject(wordType);
  342.             currentObject.setOffsetStart(previousOffset);
  343.             currentObject.setRealOffsetStart(previousOffset);
  344.             currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
  345.             if (!prevalidateObject(currentObject, currentList)) {
  346.               unknownAncestors.unknown++;
  347.             } else {
  348.               int p = position.getAndIncrement();
  349.               currentObject.addPosition(p);
  350.               currentObject.objectId = "word_" + p;
  351.               currentList.get(MAPPING_TYPE_WORD).add(currentObject);
  352.               unknownAncestors.unknown = 0;
  353.               // check for crmPair
  354.               for (int i = 0; i < 8; i++) {
  355.                 List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
  356.                 processCRMPair(mtasTokenIdFactory, p, String.valueOf(i),
  357.                     matcherList[i], currentOffset,
  358.                     functionOutputList, unknownAncestors, currentList,
  359.                     updateList, idPositions, idOffsets);
  360.                 for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
  361.                   processCRMPair(mtasTokenIdFactory, p, functionOutput.name,
  362.                       functionOutput.value, currentOffset, functionOutputList,
  363.                       unknownAncestors, currentList, updateList, idPositions,
  364.                       idOffsets);
  365.                 }
  366.               }
  367.               // compute word annotations
  368.               for (int i = 0; i < 8; i++) {
  369.                 ArrayList<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
  370.                 functionOutputList
  371.                     .addAll(processWordAnnotation(mtasTokenIdFactory,
  372.                         String.valueOf(i), matcherList[i],
  373.                         previousOffset, currentOffset, unknownAncestors,
  374.                         currentList, updateList, idPositions, idOffsets));
  375.                 for (MtasCRMParserFunctionOutput functionOutput : functionOutputList) {
  376.                   processWordAnnotation(mtasTokenIdFactory, functionOutput.name,
  377.                       functionOutput.value, previousOffset, currentOffset,
  378.                       unknownAncestors, currentList, updateList, idPositions,
  379.                       idOffsets);
  380.                 }
  381.               }
  382.             }
  383.             // finish word
  384.             if (unknownAncestors.unknown > 0) {
  385.               unknownAncestors.unknown--;
  386.             } else {
  387.               currentObject = currentList.get(MAPPING_TYPE_WORD)
  388.                   .remove(currentList.get(MAPPING_TYPE_WORD).size() - 1);
  389.               assert unknownAncestors.unknown == 0 : "error in administration "
  390.                   + currentObject.getType().getName();
  391.               currentObject.setText(null);
  392.               currentObject.setOffsetEnd(currentOffset - 1);
  393.               currentObject.setRealOffsetEnd(currentOffset - 1);
  394.               // update ancestor groups with position and offset
  395.               for (MtasParserObject currentGroup : currentList
  396.                   .get(MAPPING_TYPE_GROUP)) {
  397.                 currentGroup.addPositions(currentObject.getPositions());
  398.                 currentGroup.addOffsetStart(currentObject.getOffsetStart());
  399.                 currentGroup.addOffsetEnd(currentObject.getOffsetEnd());
  400.               }
  401.               idPositions.put(currentObject.getId(),
  402.                   currentObject.getPositions());
  403.               idOffsets.put(currentObject.getId(), currentObject.getOffset());
  404.               currentObject.updateMappings(idPositions, idOffsets);
  405.               unknownAncestors.unknown = currentObject
  406.                   .getUnknownAncestorNumber();
  407.               computeMappingsFromObject(mtasTokenIdFactory, currentObject,
  408.                   currentList, updateList);
  409.             }

  410.           } else {
  411.             // System.out.println("PROBLEM: " + line);
  412.           }
  413.         }
  414.         previousOffset = br.getPosition();
  415.       }
  416.       closePrevious(mtasTokenIdFactory, previousSentence, previousOffset,
  417.           unknownAncestors, currentList, updateList, idPositions, idOffsets);
  418.       closePrevious(mtasTokenIdFactory, previousClause, previousOffset,
  419.           unknownAncestors, currentList, updateList, idPositions, idOffsets);
  420.     } catch (IOException e) {
  421.       log.debug(e);
  422.       throw new MtasParserException(e.getMessage());
  423.     }
  424.     // final check
  425.     tokenCollection.check(autorepair, makeunique);
  426.     return tokenCollection;

  427.   }

  428.   private String[] createMatcherList(Matcher matcher) {
  429.     String[] list = new String[8];
  430.     String value;
  431.     for(int i=0; i<8; i++) {
  432.       value = matcher.group((i+1));
  433.       if(filterReplace.containsKey(i)) {
  434.         for(Entry<String,String> entry : filterReplace.get(i).entrySet()) {
  435.           value = value.replaceAll(Pattern.quote(entry.getKey()), entry.getValue());
  436.         }
  437.       }
  438.       list[i] = value;
  439.     }
  440.     return list;
  441.   }
  442.  
  443.   /**
  444.    * Process word annotation.
  445.    *
  446.    * @param mtasTokenIdFactory the mtas token id factory
  447.    * @param name the name
  448.    * @param text the text
  449.    * @param previousOffset the previous offset
  450.    * @param currentOffset the current offset
  451.    * @param unknownAncestors the unknown ancestors
  452.    * @param currentList the current list
  453.    * @param updateList the update list
  454.    * @param idPositions the id positions
  455.    * @param idOffsets the id offsets
  456.    * @return the list
  457.    * @throws MtasParserException the mtas parser exception
  458.    * @throws MtasConfigException the mtas config exception
  459.    */
  460.   private List<MtasCRMParserFunctionOutput> processWordAnnotation(
  461.       MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
  462.       Integer previousOffset, Integer currentOffset,
  463.       MtasCRMAncestors unknownAncestors,
  464.       Map<String, List<MtasParserObject>> currentList,
  465.       Map<String, Map<Integer, Set<String>>> updateList,
  466.       Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
  467.       throws MtasParserException, MtasConfigException {
  468.     MtasParserType tmpCurrentType;
  469.     MtasParserObject currentObject;
  470.     List<MtasCRMParserFunctionOutput> functionOutputList = new ArrayList<>();
  471.     if ((tmpCurrentType = wordAnnotationTypes.get(name)) != null) {
  472.       // start word annotation
  473.       currentObject = new MtasParserObject(tmpCurrentType);
  474.       currentObject.setRealOffsetStart(previousOffset);
  475.       currentObject.addPositions(currentList.get(MAPPING_TYPE_WORD)
  476.           .get((currentList.get(MAPPING_TYPE_WORD).size() - 1)).getPositions());
  477.       currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
  478.       if (!prevalidateObject(currentObject, currentList)) {
  479.         unknownAncestors.unknown++;
  480.       } else {
  481.         currentList.get(MAPPING_TYPE_WORD_ANNOTATION).add(currentObject);
  482.         unknownAncestors.unknown = 0;
  483.       }
  484.       // finish word annotation
  485.       if (unknownAncestors.unknown > 0) {
  486.         unknownAncestors.unknown--;
  487.       } else {
  488.         currentObject = currentList.get(MAPPING_TYPE_WORD_ANNOTATION)
  489.             .remove(currentList.get(MAPPING_TYPE_WORD_ANNOTATION).size() - 1);
  490.         assert unknownAncestors.unknown == 0 : "error in administration "
  491.             + currentObject.getType().getName();
  492.         if (functions.containsKey(MAPPING_TYPE_WORD_ANNOTATION)
  493.             && functions.get(MAPPING_TYPE_WORD_ANNOTATION).containsKey(name)
  494.             && text != null) {
  495.           MtasCRMParserFunction function = functions
  496.               .get(MAPPING_TYPE_WORD_ANNOTATION).get(name);
  497.           String[] value;
  498.           if (function.split != null) {
  499.             value = text.split(Pattern.quote(function.split));
  500.           } else {
  501.             value = new String[] { text };
  502.           }
  503.           for (int c = 0; c < value.length; c++) {
  504.             if (function.output.containsKey(value[c])) {
  505.               functionOutputList.addAll(function.output.get(value[c]));
  506.             }
  507.           }
  508.         }
  509.         currentObject.setText(text);
  510.         currentObject.setRealOffsetEnd(currentOffset - 1);
  511.         idPositions.put(currentObject.getId(), currentObject.getPositions());
  512.         idOffsets.put(currentObject.getId(), currentObject.getOffset());
  513.         // offset always null, so update later with word (should be possible)
  514.         if ((currentObject.getId() != null)
  515.             && (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
  516.           currentList.get(MAPPING_TYPE_WORD)
  517.               .get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
  518.               .addUpdateableIdWithOffset(currentObject.getId());
  519.         }
  520.         currentObject.updateMappings(idPositions, idOffsets);
  521.         unknownAncestors.unknown = currentObject.getUnknownAncestorNumber();
  522.         computeMappingsFromObject(mtasTokenIdFactory, currentObject,
  523.             currentList, updateList);
  524.       }
  525.     }
  526.     return functionOutputList;
  527.   }

  528.   /**
  529.    * Process CRM sentence.
  530.    *
  531.    * @param mtasTokenIdFactory the mtas token id factory
  532.    * @param name the name
  533.    * @param text the text
  534.    * @param currentOffset the current offset
  535.    * @param functionOutputList the function output list
  536.    * @param unknownAncestors the unknown ancestors
  537.    * @param currentList the current list
  538.    * @param updateList the update list
  539.    * @param idPositions the id positions
  540.    * @param idOffsets the id offsets
  541.    * @param previous the previous
  542.    * @param previousClause the previous clause
  543.    * @return the sets the
  544.    * @throws MtasParserException the mtas parser exception
  545.    * @throws MtasConfigException the mtas config exception
  546.    */
  547.   private Set<MtasParserObject> processCRMSentence(
  548.       MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
  549.       Integer currentOffset,
  550.       List<MtasCRMParserFunctionOutput> functionOutputList,
  551.       MtasCRMAncestors unknownAncestors,
  552.       Map<String, List<MtasParserObject>> currentList,
  553.       Map<String, Map<Integer, Set<String>>> updateList,
  554.       Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets,
  555.       Set<MtasParserObject> previous, Set<MtasParserObject> previousClause)
  556.       throws MtasParserException, MtasConfigException {
  557.     MtasParserType tmpCurrentType;
  558.     MtasParserObject currentObject;
  559.     if ((tmpCurrentType = crmSentenceTypes.get(name)) != null) {
  560.       String filteredText = text.replaceAll("[^0-9\\-]", "");
  561.       currentObject = new MtasParserObject(tmpCurrentType);
  562.       currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
  563.       currentObject.setRealOffsetStart(currentOffset);
  564.       currentObject.setText(filteredText);
  565.       if (!prevalidateObject(currentObject, currentList)) {
  566.         return new HashSet<>();
  567.       } else {
  568.         closePrevious(mtasTokenIdFactory, previousClause, currentOffset,
  569.             unknownAncestors, currentList, updateList, idPositions, idOffsets);
  570.         closePrevious(mtasTokenIdFactory, previous, currentOffset,
  571.             unknownAncestors, currentList, updateList, idPositions, idOffsets);
  572.         previous.clear();
  573.         currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
  574.         unknownAncestors.unknown = 0;
  575.         return new HashSet<>(Arrays.asList(currentObject));
  576.       }
  577.     }
  578.     return new HashSet<>();
  579.   }

  580.   /**
  581.    * Process CRM clause.
  582.    *
  583.    * @param mtasTokenIdFactory the mtas token id factory
  584.    * @param name the name
  585.    * @param text the text
  586.    * @param currentOffset the current offset
  587.    * @param functionOutputList the function output list
  588.    * @param unknownAncestors the unknown ancestors
  589.    * @param currentList the current list
  590.    * @param updateList the update list
  591.    * @param idPositions the id positions
  592.    * @param idOffsets the id offsets
  593.    * @param previous the previous
  594.    * @return the sets the
  595.    * @throws MtasParserException the mtas parser exception
  596.    * @throws MtasConfigException the mtas config exception
  597.    */
  598.   private Set<MtasParserObject> processCRMClause(
  599.       MtasTokenIdFactory mtasTokenIdFactory, String name, String text,
  600.       Integer currentOffset,
  601.       List<MtasCRMParserFunctionOutput> functionOutputList,
  602.       MtasCRMAncestors unknownAncestors,
  603.       Map<String, List<MtasParserObject>> currentList,
  604.       Map<String, Map<Integer, Set<String>>> updateList,
  605.       Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets,
  606.       Set<MtasParserObject> previous)
  607.       throws MtasParserException, MtasConfigException {
  608.     MtasParserType tmpCurrentType;
  609.     MtasParserObject currentObject;
  610.     if ((tmpCurrentType = crmClauseTypes.get(name)) != null) {
  611.       String filteredText = text.replaceAll("[^0-9\\-]", "");
  612.       currentObject = new MtasParserObject(tmpCurrentType);
  613.       currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
  614.       currentObject.setRealOffsetStart(currentOffset);
  615.       currentObject.setText(filteredText);
  616.       if (!prevalidateObject(currentObject, currentList)) {
  617.         return new HashSet<>();
  618.       } else {
  619.         closePrevious(mtasTokenIdFactory, previous, currentOffset,
  620.             unknownAncestors, currentList, updateList, idPositions, idOffsets);
  621.         previous.clear();
  622.         currentList.get(MAPPING_TYPE_GROUP).add(currentObject);
  623.         unknownAncestors.unknown = 0;
  624.         return new HashSet<>(Arrays.asList(currentObject));
  625.       }
  626.     }
  627.     return new HashSet<>();
  628.   }

  629.   /**
  630.    * Close previous.
  631.    *
  632.    * @param mtasTokenIdFactory the mtas token id factory
  633.    * @param previous the previous
  634.    * @param currentOffset the current offset
  635.    * @param unknownAncestors the unknown ancestors
  636.    * @param currentList the current list
  637.    * @param updateList the update list
  638.    * @param idPositions the id positions
  639.    * @param idOffsets the id offsets
  640.    * @throws MtasParserException the mtas parser exception
  641.    * @throws MtasConfigException the mtas config exception
  642.    */
  643.   private void closePrevious(MtasTokenIdFactory mtasTokenIdFactory,
  644.       Set<MtasParserObject> previous, Integer currentOffset,
  645.       MtasCRMAncestors unknownAncestors,
  646.       Map<String, List<MtasParserObject>> currentList,
  647.       Map<String, Map<Integer, Set<String>>> updateList,
  648.       Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
  649.       throws MtasParserException, MtasConfigException {
  650.     for (MtasParserObject previousObject : previous) {
  651.       previousObject.setRealOffsetEnd(currentOffset);
  652.       idPositions.put(previousObject.getId(), previousObject.getPositions());
  653.       idOffsets.put(previousObject.getId(), previousObject.getOffset());
  654.       previousObject.updateMappings(idPositions, idOffsets);
  655.       unknownAncestors.unknown = previousObject.getUnknownAncestorNumber();
  656.       computeMappingsFromObject(mtasTokenIdFactory, previousObject, currentList,
  657.           updateList);
  658.       currentList.get(MAPPING_TYPE_GROUP).remove(previousObject);
  659.     }
  660.   }

  661.   /**
  662.    * Process CRM pair.
  663.    *
  664.    * @param mtasTokenIdFactory the mtas token id factory
  665.    * @param position the position
  666.    * @param name the name
  667.    * @param text the text
  668.    * @param currentOffset the current offset
  669.    * @param functionOutputList the function output list
  670.    * @param unknownAncestors the unknown ancestors
  671.    * @param currentList the current list
  672.    * @param updateList the update list
  673.    * @param idPositions the id positions
  674.    * @param idOffsets the id offsets
  675.    * @throws MtasParserException the mtas parser exception
  676.    * @throws MtasConfigException the mtas config exception
  677.    */
  678.   private void processCRMPair(MtasTokenIdFactory mtasTokenIdFactory,
  679.       int position, String name, String text, Integer currentOffset,
  680.       List<MtasCRMParserFunctionOutput> functionOutputList,
  681.       MtasCRMAncestors unknownAncestors,
  682.       Map<String, List<MtasParserObject>> currentList,
  683.       Map<String, Map<Integer, Set<String>>> updateList,
  684.       Map<String, Set<Integer>> idPositions, Map<String, Integer[]> idOffsets)
  685.       throws MtasParserException, MtasConfigException {

  686.     MtasParserType tmpCurrentType;
  687.     MtasParserObject currentObject;

  688.     if ((tmpCurrentType = crmPairTypes.get(name)) != null) {
  689.       // get history
  690.       HashMap<String, MtasParserObject> currentNamePairHistory;
  691.       if (!historyPair.containsKey(name)) {
  692.         currentNamePairHistory = new HashMap<>();
  693.         historyPair.put(name, currentNamePairHistory);
  694.       } else {
  695.         currentNamePairHistory = historyPair.get(name);
  696.       }
  697.       Matcher m = pairPattern.matcher(text);
  698.       if (m.find()) {
  699.         String thisKey = m.group(1) + m.group(2);
  700.         String otherKey = (m.group(1).equals("b") ? "e" : "b") + m.group(2);
  701.         if (currentNamePairHistory.containsKey(otherKey)) {
  702.           currentObject = currentNamePairHistory.remove(otherKey);
  703.           currentObject.setText(currentObject.getText() + "+" + text);
  704.           currentObject.addPosition(position);
  705.           processFunctions(name, text, MAPPING_TYPE_CRM_PAIR,
  706.               functionOutputList);
  707.           currentObject.setRealOffsetEnd(currentOffset + 1);
  708.           currentObject.setOffsetEnd(currentOffset + 1);
  709.           idPositions.put(currentObject.getId(), currentObject.getPositions());
  710.           idOffsets.put(currentObject.getId(), currentObject.getOffset());
  711.           currentObject.updateMappings(idPositions, idOffsets);
  712.           unknownAncestors.unknown = currentObject.getUnknownAncestorNumber();
  713.           computeMappingsFromObject(mtasTokenIdFactory, currentObject,
  714.               currentList, updateList);
  715.         } else {
  716.           currentObject = new MtasParserObject(tmpCurrentType);
  717.           currentObject.setUnknownAncestorNumber(unknownAncestors.unknown);
  718.           currentObject.setRealOffsetStart(currentOffset);
  719.           currentObject.setOffsetStart(currentOffset);
  720.           currentObject.setText(text);
  721.           currentObject.addPosition(position);
  722.           if (!prevalidateObject(currentObject, currentList)) {
  723.             unknownAncestors.unknown++;
  724.           } else {
  725.             currentNamePairHistory.put(thisKey, currentObject);
  726.             processFunctions(name, text, MAPPING_TYPE_CRM_PAIR,
  727.                 functionOutputList);
  728.             currentObject.setRealOffsetEnd(currentOffset + 1);
  729.             currentObject.setOffsetEnd(currentOffset + 1);
  730.             idPositions.put(currentObject.getId(),
  731.                 currentObject.getPositions());
  732.             idOffsets.put(currentObject.getId(), currentObject.getOffset());
  733.             // offset always null, so update later with word (should be
  734.             // possible)
  735.             if ((currentObject.getId() != null)
  736.                 && (!currentList.get(MAPPING_TYPE_WORD).isEmpty())) {
  737.               currentList.get(MAPPING_TYPE_WORD)
  738.                   .get((currentList.get(MAPPING_TYPE_WORD).size() - 1))
  739.                   .addUpdateableIdWithOffset(currentObject.getId());
  740.             }

  741.           }
  742.         }
  743.       }

  744.     }

  745.   }

  746.   /**
  747.    * Process functions.
  748.    *
  749.    * @param name the name
  750.    * @param text the text
  751.    * @param type the type
  752.    * @param functionOutputList the function output list
  753.    */
  754.   private void processFunctions(String name, String text, String type,
  755.       List<MtasCRMParserFunctionOutput> functionOutputList) {
  756.     if (functions.containsKey(type) && functions.get(type).containsKey(name)
  757.         && text != null) {
  758.       MtasCRMParserFunction function = functions.get(type).get(name);
  759.       String[] value;
  760.       if (function.split != null) {
  761.         value = text.split(Pattern.quote(function.split));
  762.       } else {
  763.         value = new String[] { text };
  764.       }
  765.       for (int c = 0; c < value.length; c++) {
  766.         boolean checkedEmpty = false;
  767.         if (value[c].equals("")) {
  768.           checkedEmpty = true;
  769.         }
  770.         if (function.output.containsKey(value[c])) {
  771.           ArrayList<MtasCRMParserFunctionOutput> list = function.output
  772.               .get(value[c]);
  773.           for (MtasCRMParserFunctionOutput listItem : list) {
  774.             functionOutputList.add(listItem.create(value[c]));
  775.           }
  776.         }
  777.         if (!checkedEmpty && function.output.containsKey("")) {
  778.           ArrayList<MtasCRMParserFunctionOutput> list = function.output.get("");
  779.           for (MtasCRMParserFunctionOutput listItem : list) {
  780.             functionOutputList.add(listItem.create(value[c]));
  781.           }
  782.         }
  783.       }
  784.     }
  785.   }

  786.   /*
  787.    * (non-Javadoc)
  788.    *
  789.    * @see mtas.analysis.parser.MtasParser#printConfig()
  790.    */
  791.   @Override
  792.   public String printConfig() {
  793.     StringBuilder text = new StringBuilder();
  794.     text.append("=== CONFIGURATION ===\n");
  795.     text.append("type: " + wordAnnotationTypes.size() + " x wordAnnotation");
  796.     text.append(printConfigTypes(wordAnnotationTypes));
  797.     text.append("=== CONFIGURATION ===\n");
  798.     return text.toString();
  799.   }

  800.   /**
  801.    * Prints the config types.
  802.    *
  803.    * @param types the types
  804.    * @return the string
  805.    */
  806.   private String printConfigTypes(
  807.       HashMap<?, MtasParserType<MtasParserMapping<?>>> types) {
  808.     StringBuilder text = new StringBuilder();
  809.     for (Entry<?, MtasParserType<MtasParserMapping<?>>> entry : types
  810.         .entrySet()) {
  811.       text.append("- " + entry.getKey() + ": " + entry.getValue().items.size()
  812.           + " mapping(s)\n");
  813.       for (int i = 0; i < entry.getValue().items.size(); i++) {
  814.         text.append("\t" + entry.getValue().items.get(i) + "\n");
  815.       }
  816.     }
  817.     return text.toString();
  818.   }

  819.   /**
  820.    * The Class MtasCRMAncestors.
  821.    */
  822.   private static class MtasCRMAncestors {

  823.     /** The unknown. */
  824.     public int unknown = 0;
  825.   }

  826.   /**
  827.    * The Class MtasCRMParserFunction.
  828.    */
  829.   private static class MtasCRMParserFunction {

  830.     /** The split. */
  831.     public String split;

  832.     /** The output. */
  833.     public Map<String, ArrayList<MtasCRMParserFunctionOutput>> output;

  834.     /**
  835.      * Instantiates a new mtas CRM parser function.
  836.      *
  837.      * @param type the type
  838.      * @param split the split
  839.      */
  840.     public MtasCRMParserFunction(String type, String split) {
  841.       this.split = split;
  842.       output = new HashMap<>();
  843.     }

  844.   }

  845.   /**
  846.    * The Class MtasCRMParserFunctionOutput.
  847.    */
  848.   private class MtasCRMParserFunctionOutput {

  849.     /** The name. */
  850.     public String name;

  851.     /** The value. */
  852.     public String value;

  853.     /**
  854.      * Instantiates a new mtas CRM parser function output.
  855.      *
  856.      * @param name the name
  857.      * @param value the value
  858.      */
  859.     public MtasCRMParserFunctionOutput(String name, String value) {
  860.       this.name = name;
  861.       this.value = value;
  862.     }

  863.     /**
  864.      * Creates the.
  865.      *
  866.      * @param originalValue the original value
  867.      * @return the mtas CRM parser function output
  868.      */
  869.     public MtasCRMParserFunctionOutput create(String originalValue) {
  870.       if (value != null) {
  871.         return this;
  872.       } else {
  873.         return new MtasCRMParserFunctionOutput(name, originalValue);
  874.       }
  875.     }

  876.     /*
  877.      * (non-Javadoc)
  878.      *
  879.      * @see java.lang.Object#toString()
  880.      */
  881.     @Override
  882.     public String toString() {
  883.       return "MtasCRMParserFunctionOutput[" + name + "," + value + "]";
  884.     }
  885.   }

  886.   /**
  887.    * The Class MtasCRMParserMappingWordAnnotation.
  888.    */
  889.   private class MtasCRMParserMappingWordAnnotation
  890.       extends MtasParserMapping<MtasCRMParserMappingWordAnnotation> {

  891.     /**
  892.      * Instantiates a new mtas CRM parser mapping word annotation.
  893.      */
  894.     public MtasCRMParserMappingWordAnnotation() {
  895.       super();
  896.       this.position = SOURCE_OWN;
  897.       this.realOffset = SOURCE_OWN;
  898.       this.offset = SOURCE_ANCESTOR_WORD;
  899.       this.type = MAPPING_TYPE_WORD_ANNOTATION;
  900.     }

  901.     /*
  902.      * (non-Javadoc)
  903.      *
  904.      * @see mtas.analysis.parser.MtasParser.MtasParserMapping#self()
  905.      */
  906.     @Override
  907.     protected MtasCRMParserMappingWordAnnotation self() {
  908.       return this;
  909.     }
  910.   }

  911.   /**
  912.    * The Class MtasCRMParserMappingCRMSentence.
  913.    */
  914.   private class MtasCRMParserMappingCRMSentence
  915.       extends MtasParserMapping<MtasCRMParserMappingCRMSentence> {

  916.     /**
  917.      * Instantiates a new mtas CRM parser mapping CRM sentence.
  918.      */
  919.     public MtasCRMParserMappingCRMSentence() {
  920.       super();
  921.       this.position = SOURCE_OWN;
  922.       this.realOffset = SOURCE_OWN;
  923.       this.offset = SOURCE_OWN;
  924.       this.type = MAPPING_TYPE_GROUP;
  925.     }

  926.     /*
  927.      * (non-Javadoc)
  928.      *
  929.      * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
  930.      */
  931.     @Override
  932.     protected MtasCRMParserMappingCRMSentence self() {
  933.       return this;
  934.     }
  935.   }

  936.   /**
  937.    * The Class MtasCRMParserMappingCRMPair.
  938.    */
  939.   private class MtasCRMParserMappingCRMPair
  940.       extends MtasParserMapping<MtasCRMParserMappingCRMPair> {

  941.     /**
  942.      * Instantiates a new mtas CRM parser mapping CRM pair.
  943.      */
  944.     public MtasCRMParserMappingCRMPair() {
  945.       super();
  946.       this.position = SOURCE_OWN;
  947.       this.realOffset = SOURCE_OWN;
  948.       this.offset = SOURCE_OWN;
  949.       this.type = MAPPING_TYPE_RELATION;
  950.     }

  951.     /*
  952.      * (non-Javadoc)
  953.      *
  954.      * @see mtas.analysis.parser.MtasBasicParser.MtasParserMapping#self()
  955.      */
  956.     @Override
  957.     protected MtasCRMParserMappingCRMPair self() {
  958.       return this;
  959.     }
  960.   }

  961. }