MtasPennTreebankReader.java
package mtas.analysis.util;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.regex.Pattern;
public class MtasPennTreebankReader {
private Reader reader;
private int bufferPosition;
private int readPosition;
private boolean eof;
private char nextBracket;
private String[] nextStrings;
private int counterBracket;
private int currentEvent;
private int currentPosition;
private int currentStringPosition;
private static final int EVENT_INIT = 0;
public static final int EVENT_STARTBRACKET = 1;
public static final int EVENT_ENDBRACKET = 2;
public static final int EVENT_NODE = 3;
public static final int EVENT_STRING = 4;
public static final int EVENT_EOF = 5;
private static final char CHARACTER_BRACKETSTART = '(';
private static final char CHARACTER_BRACKETEND = ')';
private static final char CHARACTER_CURLYBRACKETSTART = '{';
private static final char CHARACTER_CURLYBRACKETEND = '}';
private static final int DEFAULTCHARBUFFERSIZE = 8192;
private int charBufferSize;
private char[] charBuffer;
public MtasPennTreebankReader(Reader reader) throws IOException {
this.reader = reader;
nextBracket = 0;
nextStrings = null;
counterBracket = 0;
readPosition = -1;
bufferPosition = 0;
currentEvent = EVENT_INIT;
charBuffer = null;
charBufferSize = 0;
eof = false;
next();
}
public int getEventType() {
return currentEvent;
}
public int getPosition() {
return currentPosition;
}
public String getString() throws IOException {
if (currentEvent == EVENT_NODE || currentEvent == EVENT_STRING) {
return nextStrings[currentStringPosition];
} else {
throw new IOException("unexpected state");
}
}
public boolean next() throws IOException {
if (currentEvent == EVENT_EOF) {
return false;
} else if (currentEvent == EVENT_INIT) {
if (findBracket()) {
currentPosition = 0;
currentStringPosition = -1;
currentEvent = EVENT_STRING;
if (findNextString()) {
throw new IOException("string without opening bracket");
} else if (nextBracket == CHARACTER_BRACKETSTART) {
currentEvent = EVENT_STARTBRACKET;
currentPosition = readPosition;
return true;
} else {
throw new IOException("unexpected state");
}
} else {
currentEvent = EVENT_EOF;
return false;
}
} else if (currentEvent == EVENT_NODE || currentEvent == EVENT_STRING) {
currentEvent = EVENT_STRING;
if (findNextString()) {
return true;
} else if (nextBracket == CHARACTER_BRACKETSTART) {
currentEvent = EVENT_STARTBRACKET;
currentPosition = readPosition;
return true;
} else if (nextBracket == CHARACTER_BRACKETEND) {
currentEvent = EVENT_ENDBRACKET;
currentPosition = readPosition;
return true;
} else {
throw new IOException("unexpected state");
}
} else if (currentEvent == EVENT_STARTBRACKET
|| currentEvent == EVENT_ENDBRACKET) {
if (findBracket()) {
if (currentEvent == EVENT_STARTBRACKET) {
currentEvent = EVENT_NODE;
} else {
currentEvent = EVENT_STRING;
}
currentStringPosition = -1;
if (findNextString()) {
return true;
} else if (nextBracket == CHARACTER_BRACKETSTART) {
currentEvent = EVENT_STARTBRACKET;
currentPosition = readPosition;
return true;
} else if (nextBracket == CHARACTER_BRACKETEND) {
currentEvent = EVENT_ENDBRACKET;
currentPosition = readPosition;
return true;
} else {
throw new IOException("unexpected state");
}
} else {
currentEvent = EVENT_EOF;
return false;
}
} else {
throw new IOException("unexpected state");
}
}
private boolean findNextString() throws IOException {
if (currentEvent == EVENT_NODE || currentEvent == EVENT_STRING) {
while (currentStringPosition < (nextStrings.length - 1)) {
currentStringPosition++;
currentPosition++;
if (currentStringPosition > 0) {
currentPosition += nextStrings[currentStringPosition - 1].length();
}
if (!nextStrings[currentStringPosition].trim().isEmpty()) {
return true;
}
}
if (currentStringPosition > 0) {
currentPosition += 1 + nextStrings[currentStringPosition - 1].length();
}
return false;
} else {
throw new IOException("unexpected state");
}
}
private boolean findBracket() throws IOException {
StringBuilder sBuilder = new StringBuilder("");
while (!eof) {
char c = nextCharacter();
if (c == CHARACTER_BRACKETSTART) {
counterBracket++;
nextStrings = createStrings(sBuilder.toString());
nextBracket = c;
return true;
} else if (c == CHARACTER_BRACKETEND) {
counterBracket--;
nextStrings = createStrings(sBuilder.toString());
if (counterBracket < 0) {
throw new IOException("bracket mismatch");
} else if (nextBracket == CHARACTER_BRACKETSTART
&& nextStrings.length == 0) {
throw new IOException("empty brackets");
}
nextBracket = c;
return true;
} else {
sBuilder.append(c);
}
}
if (eof) {
if(!sBuilder.toString().trim().isEmpty()) {
throw new IOException("string without closing bracket");
} else if(counterBracket!=0) {
throw new IOException("unclosed bracket(s)");
}
}
if (counterBracket != 0) {
throw new IOException("bracket mismatch");
} else {
return false;
}
}
private char nextCharacter() throws IOException {
if(charBuffer==null) {
charBuffer = new char[DEFAULTCHARBUFFERSIZE];
charBufferSize = reader.read(charBuffer, 0, charBuffer.length);
eof = (charBufferSize > 0) ? false : true;
}
if (bufferPosition < charBufferSize) {
readPosition++;
char c = charBuffer[bufferPosition];
bufferPosition++;
// refill buffer if needed
if (bufferPosition == charBufferSize) {
bufferPosition = 0;
charBufferSize = reader.read(charBuffer, 0, charBuffer.length);
if (charBufferSize <= 0) {
eof = true;
}
}
return c;
} else {
throw new IOException("no (more) characters in reader");
}
}
public static String[] createStrings(String s) throws IOException {
return createStrings(s, null);
}
public static String[] createStrings(String s, String splitRegexp) throws IOException {
final String pattern = Pattern.quote(Character.toString(CHARACTER_CURLYBRACKETSTART))+".*"+Pattern.quote(Character.toString(CHARACTER_CURLYBRACKETEND));
splitRegexp = (splitRegexp==null) ? "\\s" : splitRegexp;
String[] initialList = s.split(splitRegexp, -1);
if(s.indexOf(CHARACTER_CURLYBRACKETSTART)==-1) {
return initialList;
} else {
String[] finalList = new String[initialList.length];
int j = 0;
boolean withinCurlyBracket = false;
for(int i=0; i<initialList.length; i++) {
if(withinCurlyBracket) {
finalList[j-1] = finalList[j-1] + " " + initialList[i];
if(initialList[i].indexOf(CHARACTER_CURLYBRACKETEND)!=-1) {
String item = CHARACTER_CURLYBRACKETSTART+initialList[i];
if(item.replaceAll(pattern, "").indexOf(CHARACTER_CURLYBRACKETSTART)==-1) {
withinCurlyBracket = false;
}
}
} else {
finalList[j] = initialList[i];
j++;
if(initialList[i].indexOf(CHARACTER_CURLYBRACKETSTART)!=-1) {
String item = initialList[i];
if(item.replaceAll(pattern, "").indexOf(CHARACTER_CURLYBRACKETSTART)!=-1) {
withinCurlyBracket = true;
}
}
}
}
if(withinCurlyBracket) {
throw new IOException("unclosed curly bracket for "+s);
}
return Arrays.copyOf(finalList, j);
}
}
}