Some code snippets to illustrate the use of Mtas directly with Apache Lucene.
Create index
Create an index with three FoLiA files, using the configuration file folia.xml:
String configFile = "folia.xml";
HashMap<String,String> files = new HashMap<String,String>();
files.put("title 1","resource1.xml.gz");
files.put("title 2","resource2.xml.gz");
files.put("title 3","resource3.xml.gz");
CreateIndex createIndex = new CreateIndex(configFile, files);
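As a quick sanity check (a minimal sketch using only standard Lucene calls), the number of indexed documents can be verified before searching:
IndexReader reader = DirectoryReader.open(createIndex.getDirectory());
System.out.println(reader.numDocs() + " documents in index");
reader.close();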
Basic search
With the created index and the CQL expression [pos="LID"]
String cql = "[pos=\"LID\"]";
Directory directory = createIndex.getDirectory();
the number of hits in each document can be computed with
IndexReader indexReader = DirectoryReader.open(directory);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
ListIterator<LeafReaderContext> iterator = indexReader.leaves().listIterator();
IndexSearcher searcher = new IndexSearcher(indexReader);
SpanWeight spanweight = ((MtasSpanQuery) q.rewrite(indexReader))
    .createWeight(searcher, false);
// walk the index segment by segment
while (iterator.hasNext()) {
  LeafReaderContext lrc = iterator.next();
  Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
  SegmentReader r = (SegmentReader) lrc.reader();
  if (spans != null) {
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      // skip deleted documents
      if (r.numDocs() == r.maxDoc() || r.getLiveDocs().get(spans.docID())) {
        System.out.print("Document " + (lrc.docBase + spans.docID()) + ": ");
        // count the matches within this document
        int hits = 0;
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
          hits++;
        }
        // spans.docID() is segment-local, so use it directly on the segment reader
        System.out.println(hits + " hits in '"
            + r.document(spans.docID()).get(CreateIndex.FIELD_TITLE) + "'");
      }
    }
  }
}
indexReader.close();
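The helper createQuery (see the appendix) accepts any Mtas CQL expression; for example a two-token sequence (the part-of-speech values are hypothetical and depend on the tagset used in the FoLiA resources):
MtasSpanQuery q2 = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"ADJ\"][pos=\"N\"]", null, null);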
Advanced search
Using the provided collect method, more advanced options become available as well, such as computing a term vector:
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
ComponentField fieldStats = new ComponentField(CreateIndex.FIELD_CONTENT,
    CreateIndex.FIELD_ID);
// the full document set: the three indexed documents
ArrayList<Integer> fullDocSet = new ArrayList<Integer>(
    Arrays.asList(new Integer[] { 0, 1, 2 }));
ArrayList<Integer> fullDocList = new ArrayList<Integer>();
try {
  // term vector over prefix "t": statistics n and sum for the top 10 terms,
  // sorted descending by sum
  fieldStats.termVectorList.add(new ComponentTermVector("wordList", "t", null,
      false, "n,sum", CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC, null, 10,
      null, null, null, null, null, null, null, null, null));
  CodecUtil.collect(CreateIndex.FIELD_CONTENT, searcher, indexReader,
      fullDocList, fullDocSet, fieldStats);
  for (ComponentTermVector ct : fieldStats.termVectorList) {
    HashMap<String, Map<String, Object>> tvList = new HashMap<String, Map<String, Object>>();
    Map<String, ?> tcList = ct.subComponentFunction.dataCollector.getResult()
        .getList();
    for (String key : tcList.keySet()) {
      tvList.put(key, ((MtasDataItem<?, ?>) tcList.get(key)).rewrite(false));
    }
    System.out.println(tvList);
  }
} catch (IllegalAccessException | IllegalArgumentException
    | InvocationTargetException | mtas.parser.function.ParseException e) {
  e.printStackTrace();
}
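Inside the loop above, the rewritten items are plain maps holding the requested statistics n and sum, so they can be post-processed with standard Java; a minimal sketch printing the terms ordered by descending sum (assuming the sum statistic is returned as a Number):
tvList.entrySet().stream()
    .sorted((e1, e2) -> Long.compare(
        ((Number) e2.getValue().get("sum")).longValue(),
        ((Number) e1.getValue().get("sum")).longValue()))
    .forEach(e -> System.out.println(e.getKey() + "\t" + e.getValue()));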
Appendix
Code class CreateIndex
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
public class CreateIndex {

  public static String FIELD_ID = "id";
  public static String FIELD_TITLE = "title";
  public static String FIELD_CONTENT = "content";

  private Directory directory;

  public CreateIndex(String configFile, HashMap<String, String> files)
      throws IOException {
    this(null, configFile, files);
  }

  public CreateIndex(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    initialize(indexPath, configFile, files);
  }

  public Directory getDirectory() {
    return directory;
  }

  private void initialize(String indexPath, String configFile,
      HashMap<String, String> files) throws IOException {
    // use an index on disk if a path is provided, otherwise keep it in memory
    if (indexPath != null) {
      directory = FSDirectory.open(Paths.get(indexPath));
    } else {
      directory = new RAMDirectory();
    }
    // the Mtas char filter treats the content field as a reference to a file
    Map<String, String> paramsCharFilterMtas = new HashMap<String, String>();
    paramsCharFilterMtas.put("type", "file");
    // the Mtas tokenizer parses the file using the provided configuration
    Map<String, String> paramsTokenizer = new HashMap<String, String>();
    paramsTokenizer.put("configFile", configFile);
    // base directory for the analyzer's resource loader; adjust to the
    // location of folia.xml
    Analyzer mtasAnalyzer = CustomAnalyzer
        .builder(Paths.get("docker").toAbsolutePath())
        .addCharFilter("mtas", paramsCharFilterMtas)
        .withTokenizer("mtas", paramsTokenizer).build();
    // use the Mtas analyzer only for the content field
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(), analyzerPerField);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setUseCompoundFile(false);
    config.setCodec(Codec.forName("MtasCodec"));
    IndexWriter w = new IndexWriter(directory, config);
    w.deleteAll();
    int counter = 0;
    for (String title : files.keySet()) {
      Document doc = new Document();
      doc.add(new StringField(FIELD_ID, Integer.toString(counter),
          Field.Store.YES));
      doc.add(new StringField(FIELD_TITLE, title, Field.Store.YES));
      doc.add(new TextField(FIELD_CONTENT, files.get(title), Field.Store.YES));
      w.addDocument(doc);
      counter++;
    }
    w.commit();
    w.close();
  }
}
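To write the index to disk instead of keeping it in memory, the constructor taking an indexPath can be used; a sketch, reusing the files map from the first snippet (the path is hypothetical):
CreateIndex createIndex = new CreateIndex("/tmp/mtas-index", "folia.xml", files);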
Code method createQuery
MtasSpanQuery createQuery(String field, String cql, MtasSpanQuery ignore,
    Integer maximumIgnoreLength) throws ParseException {
  Reader reader = new BufferedReader(new StringReader(cql));
  MtasCQLParser p = new MtasCQLParser(reader);
  return p.parse(field, null, null, ignore, maximumIgnoreLength);
}
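The ignore argument lets a sequence query skip over intervening tokens; a sketch of how it could be used (the part-of-speech values are hypothetical, and maximumIgnoreLength is assumed here to cap the number of skipped tokens at two):
// ignore adverbs between the parts of the sequence (hypothetical tagset)
MtasSpanQuery ignore = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"ADV\"]", null, null);
// match an adjective followed by a noun, allowing up to two ignored tokens
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT,
    "[pos=\"ADJ\"][pos=\"N\"]", ignore, 2);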