m_userHeaders = null;
/**
- * Creates a new ParseContext Object with an entry with this {@link #CrawlerContext} configuration. This method is only for convenience.
+ * Creates a new ParseContext object containing an entry for this {@link CrawlerContext} configuration. This method is only for convenience.
*
* @return the created ParseContext Object.
*/
diff --git a/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java b/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java
index 5ee6b69..92ef3d1 100644
--- a/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java
+++ b/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java
@@ -18,6 +18,17 @@
+import com.sun.mail.imap.IMAPFolder;
+import com.sun.mail.imap.IMAPMessage;
+import de.dfki.km.leech.detect.DatasourceMediaTypes;
+import de.dfki.km.leech.parser.ImapCrawlerParser;
+import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
+import de.dfki.km.leech.util.UrlUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+import javax.mail.*;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
@@ -27,24 +38,6 @@
import java.util.logging.Level;
import java.util.logging.Logger;
-import javax.mail.Folder;
-import javax.mail.Message;
-import javax.mail.MessagingException;
-import javax.mail.Store;
-import javax.mail.URLName;
-
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-
-import com.sun.mail.imap.IMAPFolder;
-import com.sun.mail.imap.IMAPMessage;
-
-import de.dfki.km.leech.detect.DatasourceMediaTypes;
-import de.dfki.km.leech.parser.ImapCrawlerParser;
-import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
-import de.dfki.km.leech.util.UrlUtil;
-
public class ImapURLStreamProvider extends URLStreamProvider
diff --git a/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java b/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java
index b07e5c5..bd319e0 100644
--- a/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java
+++ b/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java
@@ -2,8 +2,13 @@
-import de.dfki.inquisition.lucene.DynamicFieldType;
-import de.dfki.inquisition.lucene.FieldConfig;
+// import de.dfki.inquisitor.lucene.DynamicFieldType;
+// import de.dfki.inquisitor.lucene.FieldConfig;
+
+
+
+import de.dfki.km.leech.lucene.basic.DynamicFieldType;
+import de.dfki.km.leech.lucene.basic.FieldConfig;
diff --git a/src/main/java/de/dfki/km/leech/lucene/LeechSimpleAnalyzer.java b/src/main/java/de/dfki/km/leech/lucene/LeechSimpleAnalyzer.java
new file mode 100644
index 0000000..0180ffd
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/LeechSimpleAnalyzer.java
@@ -0,0 +1,40 @@
+package de.dfki.km.leech.lucene;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.util.Version;
+
+
+
+/**
+ * An {@link Analyzer} that tokenizes with a {@link LetterOrDigitLowerCaseTokenizer}, which already lower-cases its tokens, so no additional {@link LowerCaseFilter} is needed.
+ **/
+public class LeechSimpleAnalyzer extends Analyzer
+{
+
+ static final protected LeechSimpleAnalyzer m_singelton = new LeechSimpleAnalyzer();
+
+ static public LeechSimpleAnalyzer getSingleton()
+ {
+ return m_singelton;
+ }
+
+
+ /**
+ * Creates a new {@link LeechSimpleAnalyzer}
+ */
+ public LeechSimpleAnalyzer()
+ {
+ }
+
+
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName)
+ {
+ return new TokenStreamComponents(new LetterOrDigitLowerCaseTokenizer());
+ }
+}
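Editorial note, not part of the patch: a minimal usage sketch of the new analyzer. It feeds a sample string through LeechSimpleAnalyzer and prints the resulting tokens; the field name "body" and the sample text are made up, and the TokenStream calls are the standard Lucene 5.x/6.x API that the surrounding code appears to target.

```java
// Hypothetical demo (not part of the patch): tokenize a sample string with
// LeechSimpleAnalyzer and print the lower-cased letter-or-digit tokens.
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import de.dfki.km.leech.lucene.LeechSimpleAnalyzer;

public class LeechSimpleAnalyzerDemo
{
    public static void main(String[] args) throws Exception
    {
        // "body" is an arbitrary field name; this analyzer treats all fields the same
        try (TokenStream ts = LeechSimpleAnalyzer.getSingleton().tokenStream("body", "Crawling 42 Documents, fast!"))
        {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken())
                System.out.println(term.toString()); // crawling, 42, documents, fast
            ts.end();
        }
    }
}
```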
diff --git a/src/main/java/de/dfki/km/leech/lucene/LetterOrDigitLowerCaseTokenizer.java b/src/main/java/de/dfki/km/leech/lucene/LetterOrDigitLowerCaseTokenizer.java
new file mode 100644
index 0000000..298ab6a
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/LetterOrDigitLowerCaseTokenizer.java
@@ -0,0 +1,55 @@
+package de.dfki.km.leech.lucene;
+
+
+
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.util.AttributeFactory;
+
+
+
+/**
+ * Tokenizer that treats consecutive letter or digit characters as one token; all other characters separate tokens. The characters are also converted to lower case.
+ *
+ * Note: this does a decent job for most European languages, but does a terrible job for some Asian languages, where words may not be separated by
+ * spaces, etc.
+ *
+ * @author Christian Reuschling, Dipl.Ing.(BA)
+ */
+public class LetterOrDigitLowerCaseTokenizer extends CharTokenizer
+{
+
+ public LetterOrDigitLowerCaseTokenizer(AttributeFactory factory)
+ {
+ super(factory);
+ }
+
+
+
+ public LetterOrDigitLowerCaseTokenizer()
+ {
+ super();
+ }
+
+
+
+
+ /**
+ * Collects only characters which satisfy {@link Character#isLetterOrDigit(int)}.
+ */
+ @Override
+ protected boolean isTokenChar(int c)
+ {
+ return Character.isLetterOrDigit(c);
+ }
+
+
+
+ /**
+ * Converts the character to lower case using {@link Character#toLowerCase(int)}.
+ */
+ @Override
+ protected int normalize(int c)
+ {
+ return Character.toLowerCase(c);
+ }
+}
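Again purely illustrative and not part of the patch: driving the tokenizer directly shows the splitting behaviour described in its Javadoc. The setReader/reset/incrementToken/end calls are the standard Lucene Tokenizer API; the input string is invented.

```java
// Hypothetical demo (not part of the patch): run LetterOrDigitLowerCaseTokenizer directly.
import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import de.dfki.km.leech.lucene.LetterOrDigitLowerCaseTokenizer;

public class LetterOrDigitTokenizerDemo
{
    public static void main(String[] args) throws Exception
    {
        LetterOrDigitLowerCaseTokenizer tokenizer = new LetterOrDigitLowerCaseTokenizer();
        tokenizer.setReader(new StringReader("Leech-2.0 crawls IMAP4 folders"));

        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken())
            System.out.println(term.toString()); // leech, 2, 0, crawls, imap4, folders
        tokenizer.end();
        tokenizer.close();
    }
}
```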
diff --git a/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java b/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java
index 6b9207c..b4e5ada 100644
--- a/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java
+++ b/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java
@@ -1,16 +1,16 @@
/*
* Leech - crawling capabilities for Apache Tika
- *
+ *
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
- *
+ *
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License along with this program. If not, see .
- *
+ *
* Contact us by mail: christian.reuschling@dfki.de
*/
@@ -18,60 +18,47 @@
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+import de.dfki.inquisitor.file.FileUtilz;
+// import de.dfki.inquisitor.lucene.FieldConfig;
+import de.dfki.km.leech.Leech;
+import de.dfki.km.leech.lucene.basic.FieldConfig;
+import de.dfki.km.leech.metadata.LeechMetadata;
+import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
+import de.dfki.km.leech.sax.DataSinkContentHandler;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.tika.metadata.Metadata;
+
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.rmi.server.UID;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.util.Map.Entry;
-import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
-import org.apache.tika.metadata.Metadata;
-
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.file.FileUtils;
-import de.dfki.inquisition.lucene.FieldConfig;
-import de.dfki.km.leech.Leech;
-import de.dfki.km.leech.metadata.LeechMetadata;
-import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
-import de.dfki.km.leech.sax.DataSinkContentHandler;
-
/**
* This is a content handler that allows to store crawled data into a Lucene index. You are able to configure the field types and the analyzers that should be used.
- * Further, blockindexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is supported, you can enable it with
+ * Further, block indexing with {@link IndexWriter#addDocuments(Iterable)} is supported; you can enable it with
* {@link ToLuceneContentHandler#setBlockIndexing(boolean)}. If it is enabled, {@link ToLuceneContentHandler} checks whether inside the metadata is a
* {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a {@link LeechMetadata#childId} entry will appear as parent documents, docs with
* an {@link LeechMetadata#parentId} as childs. {@link ToLuceneContentHandler} collects the child documents if they appear at a processXXX method, and writes them as
* block at the time a succeeding parent document appears. In the case a non-parent doc appears, all collected docs will be indexed normally, not as block.
- *
+ *
* @author Christian Reuschling, Dipl.Ing.(BA)
- *
*/
public class ToLuceneContentHandler extends DataSinkContentHandler
{
@@ -90,7 +77,7 @@ public void run()
{
List llDocs = m_addDocsQueue.take();
- if(llDocs instanceof InterruptThreadList)
+ if (llDocs instanceof InterruptThreadList)
{
break;
}
@@ -99,48 +86,36 @@ public void run()
{
- if(llDocs.size() == 1)
+ if (llDocs.size() == 1)
{
getCurrentWriter().addDocument(llDocs.get(0));
}
- else if(llDocs.size() > 1)
+ else if (llDocs.size() > 1)
{
getCurrentWriter().addDocuments(llDocs);
}
-
- }
- catch (Exception e)
+ } catch (Exception e)
{
- Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(
- Level.WARNING,
- "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug."
- + llDocs);
+ Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.WARNING,
+ "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug." + llDocs);
}
-
}
- }
- catch (InterruptedException e)
+ } catch (InterruptedException e)
{
// NOP
- }
- catch (Exception e)
+ } catch (Exception e)
{
Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e);
-
- }
- finally
+ } finally
{
try
{
m_cyclicBarrier4DocConsumerThreads.await();
- }
- catch (Exception e2)
+ } catch (Exception e2)
{
Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e2);
}
}
-
-
}
}
@@ -266,6 +241,21 @@ public ToLuceneContentHandler(Metadata metadata, int writeLimit, FieldConfig fie
+ protected void addStaticAttValuePairs(Document doc) throws Exception
+ {
+ for (Entry fieldName2Value : getStaticAttributeValuePairs().entryList())
+ {
+ IndexableField field = m_fieldConfig.createField(fieldName2Value.getKey(), fieldName2Value.getValue());
+ if (field != null)
+ doc.add(field);
+ else
+ Logger.getLogger(ToLuceneContentHandler.class.getName())
+ .warning("Could not create lucene field for " + fieldName2Value.getKey() + ":" + fieldName2Value.getValue() + ". Will ignore it.");
+ }
+ }
+
+
+
/**
* Will merge all temporar indices together into the initial indexWriter index. This is only necessary if SplitAndMerge is enabled. Otherwise you don't have to invoke
* this method.
@@ -283,12 +273,13 @@ public void crawlFinished()
m_llConsumerThreads.clear();
- if(getSplitAndMergeIndex() <= 0) return;
+ if (getSplitAndMergeIndex() <= 0)
+ return;
// hier mergen wir nun alle temporären indices in den originalen
// der temporären müssen noch geschlossen werden - das machen wir jetzt. Der letzte steht noch nicht in der Liste
- if(m_luceneWriter != m_initialLuceneWriter)
+ if (m_luceneWriter != m_initialLuceneWriter)
{
for (IndexWriter writer2close : m_llIndexWriter2Close)
writer2close.close();
@@ -300,7 +291,8 @@ public void crawlFinished()
for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
llIndicesDirs2Merge.add(new SimpleFSDirectory(Paths.get(strTmpPath)));
- if(llIndicesDirs2Merge.size() == 0) return;
+ if (llIndicesDirs2Merge.size() == 0)
+ return;
Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will merge " + llIndicesDirs2Merge.size() + " temporary indices to the final one.");
@@ -310,15 +302,175 @@ public void crawlFinished()
m_initialLuceneWriter.commit();
for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
- FileUtils.deleteDirectory(new File(strTmpPath));
+ FileUtilz.deleteDirectory(new File(strTmpPath));
+ } catch (Exception e)
+ {
+ Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
+ }
+ }
+
+
+
+ /**
+ * Returns null in case the document should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
+ *
+ * @param metadata
+ * @param strFulltext
+ *
+ * @return null in case the document should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
+ *
+ * @throws Exception
+ */
+ protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception
+ {
+ // // we don't create a new Document object if it is not absolutely necessary - for that we keep the references to the already allocated
+ // // Document objects
+ // // Document Object reuse
+ // Document doc = null;
+ // for (Document preAllocatedDoc : m_llAllocatedDocuments)
+ // {
+ // if(!m_llLastChildDocuments.contains(preAllocatedDoc))
+ // {
+ // doc = preAllocatedDoc;
+ // LinkedList llFieldNames = new
+ // for (Fieldable field : doc.getFields())
+ // doc.removeFields(field.name());
+ //
+ // break;
+ // }
+ // }
+ // if(doc == null)
+ // {
+ // doc = new Document();
+ // m_llAllocatedDocuments.add(doc);
+ // }
+
+ Document doc = new Document();
+
+ // The fact that a Field cannot be built from a reader is the reason why processNewMetaData passes the full text as a String and not as a
+ // reader
+
+ // a unique ID must be present
+ if (metadata.getValues(LeechMetadata.id).length == 0)
+ doc.add(m_fieldConfig.createField(LeechMetadata.id, new UID().toString()));
+ if (!getFields2Ignore().contains(LeechMetadata.body))
+ doc.add(m_fieldConfig.createField(LeechMetadata.body, strFulltext));
+ // the copies
+ for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body))
+ if (!getFields2Ignore().contains(strFieldCopy))
+ doc.add(m_fieldConfig.createField(strFieldCopy, strFulltext));
+
+
+ // the remaining metadata
+ for (String strFieldName : metadata.names())
+ {
+ if (!getFields2Ignore().contains(strFieldName))
+ {
+ for (String strValue : metadata.getValues(strFieldName))
+ {
+ IndexableField field = m_fieldConfig.createField(strFieldName, strValue);
+ if (field != null)
+ doc.add(field);
+ else
+ Logger.getLogger(ToLuceneContentHandler.class.getName())
+ .warning("Could not create lucene field for " + strFieldName + ":" + strValue + ". Will ignore it.");
+ }
+ }
+
+ // the copies
+ for (String strFieldCopy : getFieldCopyMap().get(strFieldName))
+ if (!getFields2Ignore().contains(strFieldCopy))
+ {
+ for (String strValue : metadata.getValues(strFieldName))
+ {
+ IndexableField field = m_fieldConfig.createField(strFieldCopy, strValue);
+ if (field != null)
+ doc.add(field);
+ else
+ Logger.getLogger(ToLuceneContentHandler.class.getName())
+ .warning("Could not create lucene field for " + strFieldCopy + ":" + strValue + ". Will ignore it.");
+ }
+ }
}
- catch (Exception e)
+
+ // the static attribute-value pairs
+ addStaticAttValuePairs(doc);
+
+ // and now we do the aggregation
+ for (String strTargetAtt : getFieldAggregationMap().keySet())
{
- Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
+ // if the target attribute already exists in the doc, we don't aggregate anything
+ if (doc.get(strTargetAtt) != null)
+ continue;
+
+ Collection colSourceAtts = getFieldAggregationMap().get(strTargetAtt);
+
+ for (String strSourceAtt : colSourceAtts)
+ {
+ String strNewValue = metadata.get(strSourceAtt);
+ if (strNewValue == null)
+ strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt);
+
+ if (strNewValue != null)
+ {
+ IndexableField field = m_fieldConfig.createField(strTargetAtt, strNewValue);
+ if (field != null)
+ doc.add(field);
+ else
+ Logger.getLogger(ToLuceneContentHandler.class.getName())
+ .warning("Could not create lucene field for " + strTargetAtt + ":" + strNewValue + ". Will ignore it.");
+
+ break;
+ }
+ }
+ }
+
+
+
+ // if a doc does not match our constraints, we ignore it here by returning null
+ if (m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0)
+ return doc;
+
+ for (Entry fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet())
+ {
+ IndexableField[] fieldables = doc.getFields(fieldname2fieldValRegEx.getKey());
+ for (IndexableField fieldable : fieldables)
+ {
+ String strVal = fieldable.stringValue();
+ if (strVal.matches(fieldname2fieldValRegEx.getValue()))
+ {
+ // we have a match
+ return doc;
+ }
+ }
}
+
+ return null;
+ }
+
+
+
+ protected void ensureConsumerThreadsRunning()
+ {
+ if (m_llConsumerThreads.size() != 0)
+ return;
+
+ int iCoreCount = Runtime.getRuntime().availableProcessors();
+ int iThreadCount = (int) Math.round(iCoreCount / 2d);
+ iThreadCount = Math.max(iThreadCount, 1);
+
+ m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iThreadCount + 1);
+ for (int i = 0; i < iThreadCount; i++)
+ {
+ Thread consumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + i);
+ m_llConsumerThreads.add(consumerThread);
+ consumerThread.setDaemon(true);
+
+ consumerThread.start();
+ }
}
@@ -330,11 +482,58 @@ public boolean getBlockIndexing()
+ synchronized protected IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException
+ {
+
+
+ if (getSplitAndMergeIndex() <= 0)
+ return m_initialLuceneWriter;
+
+ if (m_luceneWriter.maxDoc() < getSplitAndMergeIndex())
+ return m_luceneWriter;
+
+
+ Directory directory = m_initialLuceneWriter.getDirectory();
+
+ Path fOurTmpDir = null;
+ if (directory instanceof FSDirectory)
+ {
+ if (m_luceneWriter != m_initialLuceneWriter)
+ m_llIndexWriter2Close.add(m_luceneWriter);
+
+ String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString();
+ // if(strTmpPath.charAt(strTmpPath.length() - 1) == '/' || strTmpPath.charAt(strTmpPath.length() - 1) == '\\')
+ // strTmpPath = strTmpPath.substring(0, strTmpPath.length() - 1);
+ strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1);
+ fOurTmpDir = Paths.get(strTmpPath);
+ }
+ else
+ {
+ // we need something temporary
+ File parentDir = new File(System.getProperty("java.io.tmpdir"));
+ fOurTmpDir = Paths.get(parentDir.getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_"));
+ }
+
+ Logger.getLogger(ToLuceneContentHandler.class.getName())
+ .info("Current index exceeds " + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + fOurTmpDir);
+
+
+ @SuppressWarnings("deprecation") IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer());
+ config.setOpenMode(OpenMode.CREATE);
+
+ m_luceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config);
+ m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString());
+
+ return m_luceneWriter;
+ }
+
+
+
/**
* Gets the field aggregation map. This means that you want to generate a field entry, whereby its value should be copied from another, existing metadata entry. You
* can specify a list of these source-attributes, the first who have an entry wins and appears as new attribute, so the source field name list is in fact a priorized
* list.
- *
+ *
* @return the current field aggregation map
*/
public MultiValueHashMap getFieldAggregationMap()
@@ -346,7 +545,7 @@ public MultiValueHashMap getFieldAggregationMap()
/**
* Gets the field config
- *
+ *
* @return the field config
*/
public FieldConfig getFieldConfig()
@@ -360,7 +559,7 @@ public FieldConfig getFieldConfig()
* Gets the field copy mappings. This means that the content of every metadata key that is specified as key inside hsSource2TargetFieldnames will be copied into
* several other fields. The field names of these fields are specified as corresponding value inside hsSource2TargetFieldnames. In the case you want to rename
* attribute names, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}
- *
+ *
* @return the current field mappings
*/
public MultiValueHashMap getFieldCopyMap()
@@ -372,7 +571,7 @@ public MultiValueHashMap getFieldCopyMap()
/**
* Gets the set of field names / metadata key values that will NOT be stored into the lucene index.
- *
+ *
* @return the set of field names / metadata key values that will NOT be stored into the lucene index.
*/
public HashSet getFields2Ignore()
@@ -384,7 +583,7 @@ public HashSet getFields2Ignore()
/**
* All docs without at least one of the given fieldname-value pairs will be ignored. You can specif regular expressions as field values
- *
+ *
* @return the fieldname-value pairs. At least one have to match that a document will be written into the index
*/
public Map getIgnoreAllDocsWithout()
@@ -400,7 +599,7 @@ public Map getIgnoreAllDocsWithout()
* writing, until this one also gets 'overfilled'. In the case your crawl is finished, {@link Leech} invokes {@link ToLuceneContentHandler#crawlFinished()}. This will
* merge all temporary indices into the initial indexWriter object. This is for performance reasons because writing into a Lucene index tends to get slow after a
* certain size. Splitting and merging afterwards is faster.
- *
+ *
* @return the document count a new index will be created
*/
public int getSplitAndMergeIndex()
@@ -412,7 +611,7 @@ public int getSplitAndMergeIndex()
/**
* Sets some attribute value pairs that will be added to every crawled document.
- *
+ *
* @return the current static attribute value pairs
*/
public MultiValueHashMap getStaticAttributeValuePairs()
@@ -422,6 +621,16 @@ public MultiValueHashMap getStaticAttributeValuePairs()
+ @Override
+ protected void init()
+ {
+ Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will write crawled data into " + m_luceneWriter.getDirectory().toString());
+
+ ensureConsumerThreadsRunning();
+ }
+
+
+
@Override
public void processErrorData(Metadata metadata)
{
@@ -439,20 +648,17 @@ public void processModifiedData(Metadata metadata, String strFulltext)
// hier modifizieren wir ein schon vorhandenes Dokument
Document luceneDocument = createAndFillLuceneDocument(metadata, strFulltext);
- if(luceneDocument == null) return;
+ if (luceneDocument == null)
+ return;
// TODO: was passiert hier mit block-indexierten Dokumenten?
- m_initialLuceneWriter
- .updateDocument(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)), luceneDocument);
-
- }
- catch (Exception e)
+ m_initialLuceneWriter.updateDocument(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)), luceneDocument);
+ } catch (Exception e)
{
Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", e);
}
-
}
@@ -463,7 +669,8 @@ public void processNewData(Metadata metadata, String strFulltext)
try
{
- if(m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified");
+ if (m_initialLuceneWriter == null)
+ throw new IllegalStateException("Lucene writer was not specified");
m_luceneWriter = getCurrentWriter();
@@ -471,7 +678,8 @@ public void processNewData(Metadata metadata, String strFulltext)
Document doc = createAndFillLuceneDocument(metadata, strFulltext);
- if(doc == null) return;
+ if (doc == null)
+ return;
@@ -480,16 +688,16 @@ public void processNewData(Metadata metadata, String strFulltext)
// - wenn wir auf ein Doc ohne parent-oder child-Id stossen, dann schreiben wir alle bisherigen Docs als Einzeldokumente raus - nicht im
// Block
- if(ToLuceneContentHandler.this.getBlockIndexing())
+ if (ToLuceneContentHandler.this.getBlockIndexing())
{
- if(metadata.get(LeechMetadata.parentId) != null)
+ if (metadata.get(LeechMetadata.parentId) != null)
{
// wir haben ein child-Doc (wir haben eine Referenz zu unserem parent). Das merken wir uns einfach
m_llLastChildDocuments.add(doc);
}
- else if(metadata.get(LeechMetadata.childId) != null)
+ else if (metadata.get(LeechMetadata.childId) != null)
{
// wir haben ein parentDoc (ein parent hat min eine childId) - wir schreiben zusammen mit den bisher gesammelten im block. Das
// parentDoc ist das letzte
@@ -507,24 +715,15 @@ else if(metadata.get(LeechMetadata.childId) != null)
m_addDocsQueue.put(Collections.singletonList(doc));
}
-
-
}
else
{
m_addDocsQueue.put(Collections.singletonList(doc));
}
-
-
-
-
-
- }
- catch (Exception e)
+ } catch (Exception e)
{
Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
}
-
}
@@ -534,13 +733,15 @@ public void processNewDocument(Document doc)
try
{
- if(m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified");
+ if (m_initialLuceneWriter == null)
+ throw new IllegalStateException("Lucene writer was not specified");
m_luceneWriter = getCurrentWriter();
ensureConsumerThreadsRunning();
- if(doc == null) return;
+ if (doc == null)
+ return;
@@ -549,16 +750,16 @@ public void processNewDocument(Document doc)
// - wenn wir auf ein Doc ohne parent-oder child-Id stossen, dann schreiben wir alle bisherigen Docs als Einzeldokumente raus - nicht im
// Block
- if(ToLuceneContentHandler.this.getBlockIndexing())
+ if (ToLuceneContentHandler.this.getBlockIndexing())
{
- if(doc.get(LeechMetadata.parentId) != null)
+ if (doc.get(LeechMetadata.parentId) != null)
{
// wir haben ein child-Doc (wir haben eine Referenz zu unserem parent). Das merken wir uns einfach
m_llLastChildDocuments.add(doc);
}
- else if(doc.get(LeechMetadata.childId) != null)
+ else if (doc.get(LeechMetadata.childId) != null)
{
// wir haben ein parentDoc (ein parent hat min eine childId) - wir schreiben zusammen mit den bisher gesammelten im block. Das
// parentDoc ist das letzte
@@ -576,24 +777,15 @@ else if(doc.get(LeechMetadata.childId) != null)
m_addDocsQueue.put(Collections.singletonList(doc));
}
-
-
}
else
{
m_addDocsQueue.put(Collections.singletonList(doc));
}
-
-
-
-
-
- }
- catch (Exception e)
+ } catch (Exception e)
{
Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
}
-
}
@@ -616,18 +808,14 @@ public void processRemovedData(Metadata metadata)
// TODO: was passiert hier mit block-indexierten Dokumenten?
m_initialLuceneWriter.deleteDocuments(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)));
-
- }
- catch (Exception e)
+ } catch (Exception e)
{
Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", e);
}
-
}
-
@Override
public void processUnmodifiedData(Metadata metadata)
{
@@ -637,12 +825,12 @@ public void processUnmodifiedData(Metadata metadata)
/**
- * Sets whether block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is enabled or not. If it is enabled,
+ * Sets whether block indexing with {@link IndexWriter#addDocuments(Iterable)} is enabled or not. If it is enabled,
* {@link ToLuceneContentHandler} checks whether inside the metadata is a {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a
* {@link LeechMetadata#childId} entry will appear as parent documents, docs with an {@link LeechMetadata#parentId} as childs. {@link ToLuceneContentHandler} collects
* the child documents if they appear at a processXXX method, and writes them as block at the time a succeeding parent document appears. In the case a non-parent doc
* appears, all collected docs will be indexed normally, not as block.
- *
+ *
* @param blockIndexing true in the case blockindexing should be inabled, false otherwise.
*/
public void setBlockIndexing(boolean blockIndexing)
@@ -656,7 +844,7 @@ public void setBlockIndexing(boolean blockIndexing)
* Sets the field aggregation map. This means that you want to generate a field entry, whereby its value should be copied from another, existing metadata entry. You
* can specify a list of these source-attributes, the first who have an entry wins and appears as new attribute, so the source field name list is in fact a priorized
* list.
- *
+ *
* @param hsTarget2SourcesFieldnames the field aggregation map
*/
public void setFieldAggregationMap(MultiValueHashMap hsTarget2SourcesFieldnames)
@@ -666,16 +854,13 @@ public void setFieldAggregationMap(MultiValueHashMap hsTarget2So
-
-
-
/**
* Sets the field copy mappings. This means that the content of every metadata key that is specified as key inside hsSource2TargetFieldnames will be copied into
* several other fields. The field names of these fields are specified as corresponding value inside hsSource2TargetFieldnames. In the case you want to rename
* attribute names, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}
- *
+ *
* @param hsSource2TargetFieldnames keys: source field names, given as metadata keys. values: target field names - the content will also appear under these fields
- * inside a lucene document
+ * inside a lucene document
*/
public void setFieldCopyMap(MultiValueHashMap hsSource2TargetFieldnames)
{
@@ -687,7 +872,7 @@ public void setFieldCopyMap(MultiValueHashMap hsSource2TargetFie
/**
* Sets the set of field names / metadata key values that will NOT be stored into the lucene index. Nevertheless, you can consider these in
* {@link #setFieldCopyMap(MultiValueHashMap)}. In this case you have 'moved' the attribute value into another attribute (or several ones).
- *
+ *
* @param hsAttNamesNot2Store the set of attribute/field names that will not stored into the lucene index
*/
public void setFieldNames2Ignore(HashSet hsAttNamesNot2Store)
@@ -700,9 +885,9 @@ public void setFieldNames2Ignore(HashSet hsAttNamesNot2Store)
/**
* All docs without at least one of the given fieldname-value pairs will be ignored. You can specif regular expressions as field values. If this is set to null or to
* an empty map, all documents will be accepted.
- *
+ *
* @param hsFieldName2FieldValue the fieldname-value pairs. At least one have to match that a document will be written into the index
- *
+ *
* @return this
*/
public ToLuceneContentHandler setIgnoreAllDocsWithout(Map hsFieldName2FieldValue)
@@ -714,7 +899,6 @@ public ToLuceneContentHandler setIgnoreAllDocsWithout(Map hsFiel
-
/**
* If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)} invocation whether the current
* indexWriter has more than iSplitIndexDocumentCount documents. In the case it has more, {@link ToLuceneContentHandler} will create an entirely new index for
@@ -722,10 +906,10 @@ public ToLuceneContentHandler setIgnoreAllDocsWithout(Map hsFiel
* indices into the initial indexWriter object. This invocation will be done automatically by the {@link Leech} class. This is for performance reasons because writing
* into a Lucene index tends to get slow after a certain size. Splitting and merging afterwards is faster. Update: this behaviour depends on the Lucene version used,
* currently this seems to be not a problem. Thus, this functionality is disabled per default.
- *
+ *
* @param iSplitIndexDocumentCount the document count a new index will be created. A good size is 500 000 (from my stomach feeling, if it is necessary). -1 in the
- * case you want to disable SplitAndMerge, which is the default.
- *
+ * case you want to disable SplitAndMerge, which is the default.
+ *
* @return this
*/
public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount)
@@ -739,9 +923,9 @@ public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount
/**
* Sets some attribute value pairs that will be added to every crawled document.
- *
+ *
* @param hsStaticAttValuePairs a multi value map containing the additional attribute value pairs
- *
+ *
* @return this
*/
public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap hsStaticAttValuePairs)
@@ -750,244 +934,4 @@ public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap hsStaticAttValuePairs)
- protected void addStaticAttValuePairs(Document doc) throws Exception
- {
- for (Entry fieldName2Value : getStaticAttributeValuePairs().entryList())
- {
- IndexableField field = m_fieldConfig.createField(fieldName2Value.getKey(), fieldName2Value.getValue());
- if(field != null)
- doc.add(field);
- else
- Logger.getLogger(ToLuceneContentHandler.class.getName()).warning(
- "Could not create lucene field for " + fieldName2Value.getKey() + ":" + fieldName2Value.getValue() + ". Will ignore it.");
- }
- }
-
-
-
-
- /**
- * Returns null in the case the documents should be ignored according the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
- *
- * @param metadata
- * @param strFulltext
- *
- * @return null in the case the documents should be ignored according the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
- *
- * @throws Exception
- */
- protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception
- {
- // // wir erstellen kein Document-Object neu, wenn es nicht unbedingt nötig ist - dazu merken wir uns die Referenzen auf die schon allokierten
- // // Document Objekte
- // // Document Object reuse
- // Document doc = null;
- // for (Document preAllocatedDoc : m_llAllocatedDocuments)
- // {
- // if(!m_llLastChildDocuments.contains(preAllocatedDoc))
- // {
- // doc = preAllocatedDoc;
- // LinkedList llFieldNames = new
- // for (Fieldable field : doc.getFields())
- // doc.removeFields(field.name());
- //
- // break;
- // }
- // }
- // if(doc == null)
- // {
- // doc = new Document();
- // m_llAllocatedDocuments.add(doc);
- // }
-
- Document doc = new Document();
-
-
-
- // Das man kein Field aus einem reader machen kann ist der Grund, warum processNewMetaData den Fulltext als String und nicht als reader
- // übergibt
-
- // eine eindeutige ID muß da sein
- if(metadata.getValues(LeechMetadata.id).length == 0) doc.add(m_fieldConfig.createField(LeechMetadata.id, new UID().toString()));
- if(!getFields2Ignore().contains(LeechMetadata.body)) doc.add(m_fieldConfig.createField(LeechMetadata.body, strFulltext));
- // die kopien
- for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body))
- if(!getFields2Ignore().contains(strFieldCopy)) doc.add(m_fieldConfig.createField(strFieldCopy, strFulltext));
-
-
- // die restlichen metadaten
- for (String strFieldName : metadata.names())
- {
- if(!getFields2Ignore().contains(strFieldName))
- {
- for (String strValue : metadata.getValues(strFieldName))
- {
- IndexableField field = m_fieldConfig.createField(strFieldName, strValue);
- if(field != null)
- doc.add(field);
- else
- Logger.getLogger(ToLuceneContentHandler.class.getName()).warning(
- "Could not create lucene field for " + strFieldName + ":" + strValue + ". Will ignore it.");
- }
-
- }
-
- // die kopien
- for (String strFieldCopy : getFieldCopyMap().get(strFieldName))
- if(!getFields2Ignore().contains(strFieldCopy))
- {
- for (String strValue : metadata.getValues(strFieldName))
- {
- IndexableField field = m_fieldConfig.createField(strFieldCopy, strValue);
- if(field != null)
- doc.add(field);
- else
- Logger.getLogger(ToLuceneContentHandler.class.getName()).warning(
- "Could not create lucene field for " + strFieldCopy + ":" + strValue + ". Will ignore it.");
- }
- }
- }
-
- // die statischen Attribut-Value-Paare
- addStaticAttValuePairs(doc);
-
- // und jetzt aggregieren wir noch
- for (String strTargetAtt : getFieldAggregationMap().keySet())
- {
- // wenn es das TargetAtt schon im doc gibt, dann aggregieren wir nix
- if(doc.get(strTargetAtt) != null) continue;
-
- Collection colSourceAtts = getFieldAggregationMap().get(strTargetAtt);
-
- for (String strSourceAtt : colSourceAtts)
- {
- String strNewValue = metadata.get(strSourceAtt);
- if(strNewValue == null) strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt);
-
- if(strNewValue != null)
- {
- IndexableField field = m_fieldConfig.createField(strTargetAtt, strNewValue);
- if(field != null)
- doc.add(field);
- else
- Logger.getLogger(ToLuceneContentHandler.class.getName()).warning(
- "Could not create lucene field for " + strTargetAtt + ":" + strNewValue + ". Will ignore it.");
-
- break;
- }
- }
- }
-
-
-
- // wenn ein Doc nicht unseren constraints entspricht, dann ignorieren wir das hier, indem wir null zurück geben
- if(m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0) return doc;
-
- for (Entry fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet())
- {
- IndexableField[] fieldables = doc.getFields(fieldname2fieldValRegEx.getKey());
- for (IndexableField fieldable : fieldables)
- {
- String strVal = fieldable.stringValue();
- if(strVal.matches(fieldname2fieldValRegEx.getValue()))
- {
- // wir haben einen Treffer
- return doc;
- }
- }
- }
-
-
- return null;
- }
-
-
-
-
-
-
-
- protected void ensureConsumerThreadsRunning()
- {
- if(m_llConsumerThreads.size() != 0) return;
-
- int iCoreCount = Runtime.getRuntime().availableProcessors();
- int iThreadCount = (int) Math.round(iCoreCount / 2d);
- iThreadCount = Math.max(iThreadCount, 1);
-
- m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iThreadCount + 1);
- for (int i = 0; i < iThreadCount; i++)
- {
- Thread consumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + i);
- m_llConsumerThreads.add(consumerThread);
- consumerThread.setDaemon(true);
-
- consumerThread.start();
- }
- }
-
-
-
- synchronized protected IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException
- {
-
-
- if(getSplitAndMergeIndex() <= 0) return m_initialLuceneWriter;
-
- if(m_luceneWriter.maxDoc() < getSplitAndMergeIndex()) return m_luceneWriter;
-
-
- Directory directory = m_initialLuceneWriter.getDirectory();
-
- Path fOurTmpDir = null;
- if(directory instanceof FSDirectory)
- {
- if(m_luceneWriter != m_initialLuceneWriter) m_llIndexWriter2Close.add(m_luceneWriter);
-
- String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString();
- // if(strTmpPath.charAt(strTmpPath.length() - 1) == '/' || strTmpPath.charAt(strTmpPath.length() - 1) == '\\')
- // strTmpPath = strTmpPath.substring(0, strTmpPath.length() - 1);
- strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1);
- fOurTmpDir = Paths.get(strTmpPath);
- }
- else
- {
- // wir brauchen was temporäres
- File parentDir = new File(System.getProperty("java.io.tmpdir"));
- fOurTmpDir = Paths.get(parentDir.getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_"));
- }
-
- Logger.getLogger(ToLuceneContentHandler.class.getName()).info(
- "Current index exceeds " + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + fOurTmpDir);
-
-
- @SuppressWarnings("deprecation")
- IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer());
- config.setOpenMode(OpenMode.CREATE);
-
- m_luceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config);
- m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString());
-
- return m_luceneWriter;
- }
-
-
-
- @Override
- protected void init()
- {
- Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will write crawled data into " + m_luceneWriter.getDirectory().toString());
-
- ensureConsumerThreadsRunning();
- }
-
-
-
-
-
-
-
}
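To tie the ToLuceneContentHandler changes together, here is a hedged setup sketch, not part of the patch. Opening the IndexWriter mirrors what the new getCurrentWriter() does for its temporary indices (IndexWriterConfig without a Version argument, Path-based SimpleFSDirectory); the two setter calls exist in the class as shown in this diff. How a ToLuceneContentHandler instance is constructed is deliberately left out, since the constructor signatures only appear truncated in the hunk headers above; all names below are illustrative.

```java
// Hedged sketch (not part of the patch): Lucene writer setup in the post-upgrade
// API style used by this patch, plus typical handler tuning via setters that are
// visible in the diff. Construction of the handler itself is assumed elsewhere.
import java.nio.file.Paths;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.SimpleFSDirectory;

import de.dfki.km.leech.lucene.LeechSimpleAnalyzer;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;

public class ToLuceneSetupSketch
{
    /** Opens an index writer the same way getCurrentWriter() opens its temporary indices. */
    public static IndexWriter openWriter(String strIndexPath) throws Exception
    {
        IndexWriterConfig config = new IndexWriterConfig(LeechSimpleAnalyzer.getSingleton());
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        return new IndexWriter(new SimpleFSDirectory(Paths.get(strIndexPath)), config);
    }

    /** Typical tuning of an existing handler instance, using only setters shown in this patch. */
    public static ToLuceneContentHandler configure(ToLuceneContentHandler handler)
    {
        handler.setBlockIndexing(true);    // write parent/child documents as blocks
        handler.setSplitAndMergeIndex(-1); // -1 disables split and merge (the default)
        return handler;
    }
}
```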
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/Buzzwords.java b/src/main/java/de/dfki/km/leech/lucene/basic/Buzzwords.java
new file mode 100644
index 0000000..8ed2e52
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/Buzzwords.java
@@ -0,0 +1,954 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+import de.dfki.inquisitor.collections.MultiValueTreeMap;
+// import de.dfki.inquisitor.lucene.DynamicFieldType;
+// import de.dfki.inquisitor.lucene.*;
+import de.dfki.inquisitor.text.Levenshtein;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.*;
+import java.util.Map.Entry;
+
+
+
+/**
+ * The class Buzzwords extracts keywords out of documents. These can be given as Lucene documents, which makes the buzzword calculation very fast because most of the
+ * information is already in the Lucene index. Plain strings can also be processed, with an index as the basis for the calculation.
+ *
+ * @author Christian Reuschling, Elisabeth Wolf
+ *
+ */
+public class Buzzwords
+{
+
+
+ static protected ClassicSimilarity m_defaultSimilarity = new ClassicSimilarity();
+
+
+
+ //
+ // /**
+ // * Adds calculated buzzwords to the given document. The method makes use of the IndexAccessor default Analyzer.
+ // *
+ // * @param doc2modify the document that should enriched with a new buzzword field
+ // * @param strIdFieldName the attribute name that should be used to identify the documents according to their id String
+ // * @param strNewField4Buzzwords the attribute that should be created for the buzzword. Becomes part of the document object
+ // * @param sAttNames4BuzzwordCalculation the attributes that should be considered for buzzword generation
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords the method should generate
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return true in the case the document object was modified, false otherwise. The method do not modify the index entry
+ // *
+ // * @throws Exception
+ // */
+ // static public boolean addBuzzwords(Document doc2modify, String strIdFieldName, String strNewField4Buzzwords, Set sAttNames4BuzzwordCalculation,
+ // int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws Exception
+ // {
+ //
+ //
+ // String strDocID = getAttributeValue(doc2modify, strIdFieldName);
+ // List lBuzzwords = getBuzzwords(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // // wenn es keinen Content gibt, mache mer gar nix
+ // if(lBuzzwords == null) return false;
+ //
+ // StringBuilder strbBuzzWordz = new StringBuilder();
+ //
+ // for (int i = 0; i < Math.min(iMaxNumberOfBuzzwords, lBuzzwords.size()); i++)
+ // strbBuzzWordz.append(lBuzzwords.get(i)).append(" ");
+ //
+ //
+ // // wenn es das Buzzword-feld schon gibt, wirds gelöscht
+ // doc2modify.removeFields(strNewField4Buzzwords);
+ // // die neu berechneten Buzzwords werden zum Doc hinzugefügt
+ // doc2modify.add(new TextWithTermVectorOffsetsField(strNewField4Buzzwords, strbBuzzWordz.toString()));
+ //
+ //
+ // return true;
+ // }
+
+
+
+ /**
+ * Gets the value of an attribute inside the document as String.
+ *
+ * @param doc
+ * @param strFieldName the attributes name
+ *
+ * @return the first attribute value under the given attribute name
+ */
+ private static String getAttributeValue(Document doc, String strFieldName)
+ {
+
+ IndexableField docAtt = doc.getField(strFieldName);
+ if(docAtt == null) return null;
+
+
+ return docAtt.stringValue();
+ }
+
+
+
+ //
+ // /**
+ // * Gets the buzzwords for fields of a document. The metohd makes use of the IndexAccessor default Analyzer.
+ // *
+ // * @param strDocID the ID of the document from which the buzzwords should be extracted
+ // * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return the list of the extracted buzzwords, null in the case the given attribute doesn't exist
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URINotFoundException
+ // * @throws URISyntaxException
+ // */
+ // static public List getBuzzwords(String strDocID, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms,
+ // LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException
+ // {
+ //
+ // LinkedHashMap buzzwordsWithTfIdf =
+ // getBuzzwordsWithTfIdf(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // LinkedList llBuzzwords = new LinkedList(buzzwordsWithTfIdf.keySet());
+ //
+ //
+ // return llBuzzwords;
+ // }
+
+
+ //
+ //
+ // /**
+ // * Gets the buzzwords for fields of a document. The metohd makes use of the IndexAccessor default Analyzer.
+ // *
+ // * @param strDocID the ID of the document from which the buzzwords should be extracted
+ // * @param strFieldName the name of the attribute the buzzwords should be extracted from
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return the list of the extracted buzzwords, null in the case the given attribute doesn't exist
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URINotFoundException
+ // * @throws URISyntaxException
+ // */
+ // static public List> getBuzzwords4AllFieldValues(String strDocID, String strFieldName, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms,
+ // LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException
+ // {
+ //
+ // List> buzzwordsWithTfIdfMaps =
+ // getBuzzwordsWithTfIdf4AllFieldValues(strDocID, strFieldName, iMaxNumberOfBuzzwords, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // LinkedList> llbuzzwords4AllFieldValues = new LinkedList>();
+ // for (LinkedHashMap hsBuzzwords2TfIdf : buzzwordsWithTfIdfMaps)
+ // {
+ //
+ // LinkedList llBuzzwords = new LinkedList(hsBuzzwords2TfIdf.keySet());
+ //
+ // llbuzzwords4AllFieldValues.add(llBuzzwords);
+ // }
+ //
+ //
+ // return llbuzzwords4AllFieldValues;
+ // }
+
+
+ //
+ //
+ // /**
+ // * Gets the buzzwords for fields of a document, together with their document TfIdf value. The metohd makes use of the IndexAccessor default Analyzer.
+ // *
+ // * @param strDocID the ID of the document from which the buzzwords should be extracted
+ // * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from.
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return the extracted buzzwords, boosted according their score. Key: the term itself. Value: the according score. null in the case the given attribute doesn't
+ // * exist.
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URINotFoundException
+ // * @throws URISyntaxException
+ // */
+ // static public LinkedHashMap getBuzzwordsWithTfIdf(String strDocID, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords,
+ // boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException
+ // {
+ //
+ // MultiValueTreeMap tmScore2Term =
+ // retrieveInterestingTerms(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // if(tmScore2Term.valueSize() < iMaxNumberOfBuzzwords)
+ // {
+ //
+ // MultiValueTreeMap tmScore2TermWeak =
+ // retrieveInterestingTerms(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 1, 1, 2, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // while (tmScore2TermWeak.keySize() > 0)
+ // {
+ // Float fTfIdf = tmScore2TermWeak.firstKey();
+ // String strTopTerm = tmScore2TermWeak.getFirst(fTfIdf);
+ // tmScore2TermWeak.remove(fTfIdf, strTopTerm);
+ //
+ // if(!tmScore2Term.containsValue(strTopTerm)) tmScore2Term.add(fTfIdf, strTopTerm);
+ //
+ // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) break;
+ // }
+ // }
+ //
+ // LinkedHashMap hsTerm2TfIdf = new LinkedHashMap();
+ // for (Entry score2term : tmScore2Term.entryList())
+ // hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey());
+ //
+ //
+ // return hsTerm2TfIdf;
+ // }
+
+
+
+ //
+ // /**
+ // * This method is for calculating buzzwords out of an arbritrary String, by giving an index attribute as 'context. The string will be tokenized according the given
+ // * analyzer for this attribute (as set by the IndexAccessor default analyzer), and also takes the document frequencies for all terms of this attribute.
+ // *
+ // * @param strDocumentText the text of the document. This text influences the buzzword calculation as it would be an attribute value of
+ // * strAttributeName4BuzzwordCalculation
+ // * @param strAttributeName4BuzzwordCalculation this is the name of the attribute the given text should be differentiated against with buzzwords
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return the extracted buzzwords, with their according tfidf value, sorted by TfIdf values. Key: the term itself. Value: the tfIdf value.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URINotFoundException
+ // * @throws URISyntaxException
+ // */
+ // static public LinkedHashMap getBuzzwordsWithTfIdf(String strDocumentText, String strAttributeName4BuzzwordCalculation, int iMaxNumberOfBuzzwords,
+ // boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException
+ // {
+ // MultiValueTreeMap tmScore2Term =
+ // retrieveInterestingTerms(strDocumentText, strAttributeName4BuzzwordCalculation, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // if(tmScore2Term.valueSize() < iMaxNumberOfBuzzwords)
+ // {
+ //
+ // MultiValueTreeMap tmScore2TermWeak =
+ // retrieveInterestingTerms(strDocumentText, strAttributeName4BuzzwordCalculation, iMaxNumberOfBuzzwords, 1, 1, 2, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // while (tmScore2TermWeak.keySize() > 0)
+ // {
+ // Float fTfIdf = tmScore2TermWeak.firstKey();
+ // String strTopTerm = tmScore2TermWeak.getFirst(fTfIdf);
+ // tmScore2TermWeak.remove(fTfIdf, strTopTerm);
+ //
+ // if(!tmScore2Term.containsValue(strTopTerm)) tmScore2Term.add(fTfIdf, strTopTerm);
+ //
+ // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) break;
+ // }
+ // }
+ //
+ // LinkedHashMap hsTerm2TfIdf = new LinkedHashMap();
+ // for (Entry score2term : tmScore2Term.entryList())
+ // hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey());
+ //
+ //
+ // return hsTerm2TfIdf;
+ //
+ // }
+
+
+
+ // /**
+ // * Gets the buzzwords for fields of a document, together with their document TfIdf value. The metohd makes use of the IndexAccessor default Analyzer.
+ // *
+ // * @param strDocID the ID of the document from which the buzzwords should be extracted
+ // * @param strFieldName the name of the attribute the buzzwords should be extracted from.
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return the extracted buzzwords, boosted according their score. Key: the term itself. Value: the according score. null in the case the given attribute doesn't
+ // * exist.
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URINotFoundException
+ // * @throws URISyntaxException
+ // */
+ // static public List> getBuzzwordsWithTfIdf4AllFieldValues(String strDocID, String strFieldName, int iMaxNumberOfBuzzwords,
+ // boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException
+ // {
+ //
+ // List> tmScore2TermMaps =
+ // retrieveInterestingTerms4AllFieldValues(strDocID, strFieldName, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, hsIndexPaths);
+ //
+ // // aus Performancegründen verzichte ich hier mal auf eine 'weichere' Strategie, falls unsere Maximalanzahl der Buzzwords nicht erreicht wurde
+ //
+ // LinkedList> hsTerm2ScoreMaps = new LinkedList>();
+ //
+ // for (MultiValueTreeMap hsScore2Term : tmScore2TermMaps)
+ // {
+ // LinkedHashMap hsTerm2TfIdf = new LinkedHashMap();
+ // for (Entry score2term : hsScore2Term.entryList())
+ // hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey());
+ //
+ // hsTerm2ScoreMaps.add(hsTerm2TfIdf);
+ // }
+ //
+ //
+ // return hsTerm2ScoreMaps;
+ // }
+
+
+
+
+ /**
+ * Adds calculated buzzwords to the given document. The method makes use of the IndexAccessor default Analyzer.
+ *
+ * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify
+ * @param doc2modify the document that should be enriched with a new buzzword field
+ * @param strNewField4Buzzwords the attribute that should be created for the buzzword. Becomes part of the document object
+ * @param sAttNames4BuzzwordCalculation the attributes that should be considered for buzzword generation
+ * @param iMaxNumberOfBuzzwords the maximum number of buzzwords the method should generate
+ * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ * @param reader the lucene index reader
+ *
+ * @return true in the case the document object was modified, false otherwise. The method does not modify the index entry
+ *
+ * @throws Exception
+ */
+ static public boolean addBuzzwords(int iDocNo, Document doc2modify, String strNewField4Buzzwords, Set sAttNames4BuzzwordCalculation,
+ int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, IndexReader reader) throws Exception
+ {
+
+
+ List<String> lBuzzwords = getBuzzwords(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, reader);
+
+ // if there is no content, we do nothing at all
+ if(lBuzzwords == null) return false;
+
+ StringBuilder strbBuzzWordz = new StringBuilder();
+
+ for (int i = 0; i < Math.min(iMaxNumberOfBuzzwords, lBuzzwords.size()); i++)
+ strbBuzzWordz.append(lBuzzwords.get(i)).append(" ");
+
+
+ // if the buzzword field already exists, it gets removed
+ doc2modify.removeFields(strNewField4Buzzwords);
+ // the newly calculated buzzwords are added to the doc
+ FieldType fieldType =
+ new DynamicFieldType().setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS).setStoreD(true).setStoreTermVectorS(true)
+ .setStoreTermVectorOffsetS(true).setTokenizeD(true).freezE();
+
+ Field field4buzzwords = new Field(strNewField4Buzzwords, strbBuzzWordz.toString(), fieldType);
+ doc2modify.add(field4buzzwords);
+
+
+ return true;
+ }
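+
+ // Hypothetical usage sketch (not part of this patch): enrich a stored document with a buzzword field. The index path, id field
+ // and attribute names are assumptions; re-indexing the modified document afterwards is up to the caller.
+ //
+ // try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/leech-index"))))
+ // {
+ // int iDocNo = docID2DocNo("id", "doc-4711", reader);
+ // Document doc = reader.document(iDocNo);
+ // Set<String> hsAtts = new HashSet<>(Arrays.asList("title", "body"));
+ // if(addBuzzwords(iDocNo, doc, "buzzwords", hsAtts, 7, true, reader))
+ // {
+ // // write doc back with an IndexWriter to make the new field searchable
+ // }
+ // }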
+
+
+
+ static protected int docID2DocNo(String strDocIdAttributeName, String strDocID, IndexReader reader) throws Exception
+ {
+ int luceneDocumentNumber;
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+
+ TopDocs topDocs = searcher.search(new TermQuery(new Term(strDocIdAttributeName, strDocID)), 1);
+
+ if(topDocs.totalHits == 0) throw new Exception("no lucene document found with id '" + strDocID + "'");
+
+ // there should be only a single document with this id to be found...
+ luceneDocumentNumber = topDocs.scoreDocs[0].doc;
+
+ return luceneDocumentNumber;
+ }
+
+
+
+
+
+
+
+ /**
+ * Gets the buzzwords for fields of a document. The method makes use of the IndexAccessor default Analyzer.
+ *
+ * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify
+ * @param doc2modify the document that should be enriched with a new buzzword field
+ * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from
+ * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ * @param reader the lucene index reader
+ *
+ * @return the list of the extracted buzzwords, null in the case the given attribute doesn't exist
+ *
+ * @throws Exception
+ * @throws URINotFoundException
+ */
+ static public List<String> getBuzzwords(int iDocNo, Document doc2modify, Set<String> sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords,
+ boolean bSkipSimilarTerms, IndexReader reader) throws Exception
+ {
+
+ LinkedHashMap<String, Float> buzzwordsWithTfIdf =
+ getBuzzwordsWithTfIdf(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, reader);
+
+ LinkedList<String> llBuzzwords = new LinkedList<String>(buzzwordsWithTfIdf.keySet());
+
+
+ return llBuzzwords;
+ }
+
+
+
+ /**
+ * Gets the buzzwords for fields of a document, together with their document TfIdf value. The method makes use of the IndexAccessor default Analyzer.
+ *
+ * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify
+ * @param doc2modify the document that should be enriched with a new buzzword field
+ * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from.
+ * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ * @param reader the lucene index reader
+ *
+ * @return the extracted buzzwords, boosted according to their score. Key: the term itself. Value: the corresponding score. null in the case the given attribute doesn't
+ * exist.
+ *
+ * @throws Exception
+ */
+ static public LinkedHashMap<String, Float> getBuzzwordsWithTfIdf(int iDocNo, Document doc2modify, Set<String> sAttNames4BuzzwordCalculation,
+ int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, IndexReader reader) throws Exception
+ {
+
+ MultiValueTreeMap<Float, String> tmScore2Term =
+ retrieveInterestingTerms(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, reader);
+
+ if(tmScore2Term.valueSize() < iMaxNumberOfBuzzwords)
+ {
+
+ MultiValueTreeMap<Float, String> tmScore2TermWeak =
+ retrieveInterestingTerms(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 1, 1, 2, bSkipSimilarTerms, reader);
+
+ while (tmScore2TermWeak.keySize() > 0)
+ {
+ Float fTfIdf = tmScore2TermWeak.firstKey();
+ String strTopTerm = tmScore2TermWeak.getFirst(fTfIdf);
+ tmScore2TermWeak.remove(fTfIdf, strTopTerm);
+
+ if(!tmScore2Term.containsValue(strTopTerm)) tmScore2Term.add(fTfIdf, strTopTerm);
+
+ if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) break;
+ }
+ }
+
+ LinkedHashMap<String, Float> hsTerm2TfIdf = new LinkedHashMap<String, Float>();
+ for (Entry<Float, String> score2term : tmScore2Term.entryList())
+ hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey());
+
+
+ return hsTerm2TfIdf;
+ }
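+
+ // Hypothetical usage sketch (not part of this patch): print the buzzwords of a document together with their tf.idf scores.
+ // The attribute name "body" is an assumption.
+ //
+ // LinkedHashMap<String, Float> hsTerm2Score =
+ // getBuzzwordsWithTfIdf(iDocNo, doc, new HashSet<>(Arrays.asList("body")), 10, true, reader);
+ // for (Entry<String, Float> term2Score : hsTerm2Score.entrySet())
+ // System.out.println(term2Score.getKey() + " => " + term2Score.getValue());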
+
+
+
+ /**
+ * Gets the top frequent terms inside the given field of a document, based on the field's stored term vector.
+ * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify
+ * @param doc2modify the document that should enriched with a new buzzword field
+ * @param strFieldName the field for which you want the top frequent terms.
+ * @param iMinFrequency the minimum frequency a term must appear in this field
+ * @param iMinWordLength the minimum word length a term must have
+ * @param iMaxNumberOfTerms the maximum number of terms the method returns
+ * @param reader the lucene index reader
+ *
+ * @return the top frequent terms of the field, together with their frequencies
+ *
+ * @throws Exception
+ */
+ public static List<Term2FrequencyEntry> getTopFrequentTerms(int iDocNo, Document doc2modify, String strFieldName, int iMinFrequency, int iMinWordLength,
+ int iMaxNumberOfTerms, IndexReader reader) throws Exception
+ {
+
+ LinkedList<Term2FrequencyEntry> llTerm2Frequency = new LinkedList<Term2FrequencyEntry>();
+ PriorityQueue<Term2FrequencyEntry> pqTerm2Frequency = new PriorityQueue<Term2FrequencyEntry>(iMaxNumberOfTerms, new Comparator<Term2FrequencyEntry>()
+ {
+
+ @Override
+ public int compare(Term2FrequencyEntry o1, Term2FrequencyEntry o2)
+ {
+ return o1.getFrequency().compareTo(o2.getFrequency());
+ }
+ });
+
+ // if the field doesn't exist in this doc at all, we do nothing! (checking this is considerably cheaper than needlessly iterating through everything in the reader)
+ if(doc2modify.getField(strFieldName) == null) return llTerm2Frequency;
+
+ Terms termVector = reader.getTermVector(iDocNo, strFieldName);
+ if(termVector == null) return llTerm2Frequency;
+
+ TermsEnum termsEnum = termVector.iterator();
+
+ while (termsEnum.next() != null)
+ {
+ String strTerm = termsEnum.term().utf8ToString();
+ long lFrequency = termsEnum.totalTermFreq();
+
+ if(lFrequency >= iMinFrequency && strTerm.length() >= iMinWordLength)
+ pqTerm2Frequency.add(new Term2FrequencyEntry(strTerm, Long.valueOf(lFrequency).intValue()));
+
+ if(pqTerm2Frequency.size() > iMaxNumberOfTerms) pqTerm2Frequency.poll();
+ }
+
+ for (Term2FrequencyEntry term2Frq : pqTerm2Frequency)
+ llTerm2Frequency.add(0, term2Frq);
+
+
+
+ return llTerm2Frequency;
+ }
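+
+ // Hypothetical usage sketch (not part of this patch): list the terms that occur at least twice and have at least 3 characters
+ // in the "body" field of a document; the field name is an assumption.
+ //
+ // for (Term2FrequencyEntry term2Frq : getTopFrequentTerms(iDocNo, doc, "body", 2, 3, 20, reader))
+ // System.out.println(term2Frq.getTerm() + " : " + term2Frq.getFrequency());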
+
+
+
+ static MultiValueTreeMap<Float, String> retrieveInterestingTerms(int iDocNo, Document doc2modify, Set<String> sAttNames4BuzzwordCalculation,
+ int iMaxNumberOfBuzzwords, int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, IndexReader reader) throws Exception
+ {
+
+ int iIndexDocumentCount = reader.numDocs();
+
+ HashMap<String, Integer> hsTerm2Frequency = new HashMap<String, Integer>();
+
+ // first, the frequencies of all fields are summed up
+ for (String strFieldName : sAttNames4BuzzwordCalculation)
+ {
+
+ // XXX: for now, the number of distinct terms of the doc is hard-coded here
+ List<Term2FrequencyEntry> topFrequentTerms = getTopFrequentTerms(iDocNo, doc2modify, strFieldName, iMinTermFreq, iMinWordLen, 1234, reader);
+
+ for (Term2FrequencyEntry topTerm2FreqLocal : topFrequentTerms)
+ {
+ Integer iFreqOld = hsTerm2Frequency.get(topTerm2FreqLocal.getTerm());
+ if(iFreqOld == null)
+ iFreqOld = topTerm2FreqLocal.getFrequency();
+ else
+ iFreqOld += topTerm2FreqLocal.getFrequency();
+
+ hsTerm2Frequency.put(topTerm2FreqLocal.getTerm(), iFreqOld);
+ }
+ }
+
+ // now the terms are sorted by their score (tfIdf)
+ MultiValueTreeMap<Float, String> tmScore2Term = new MultiValueTreeMap<Float, String>(HashSet.class);
+ for (Entry<String, Integer> term2Frequency : hsTerm2Frequency.entrySet())
+ {
+ String strTerm = term2Frequency.getKey();
+ Integer iTermFrequency = term2Frequency.getValue();
+
+ // we specified how often the term has to occur at minimum
+ if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue;
+
+ // we ignore plain numbers
+ if(!strTerm.matches("\\D+")) continue;
+
+ // the maximum docFrequency over all fields is taken into account (as in MoreLikeThis)
+ int iMaxDocumentFrequency = 0;
+ for (String strField : sAttNames4BuzzwordCalculation)
+ {
+ int iDocumentFrequency = reader.docFreq(new Term(strField, strTerm));
+ if(iMaxDocumentFrequency < iDocumentFrequency) iMaxDocumentFrequency = iDocumentFrequency;
+ }
+
+ if(iMinDocFreq > 0 && iMaxDocumentFrequency < iMinDocFreq) continue;
+
+ // this should not happen, actually - in case of error we simply ignore it
+ if(iMaxDocumentFrequency == 0) continue;
+
+ // this is the formula of the defaultSimilarity. We will simply never need a different one
+ float fIdf = m_defaultSimilarity.idf(iMaxDocumentFrequency, iIndexDocumentCount);
+ float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf;
+
+ boolean bRemoveLastTerm4Score = false;
+ // only the top terms - if we are above the max count, we swap out the smallest one
+ if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords)
+ {
+ // we are above the limit
+ // if our smallest one is already bigger, we ignore the new one
+ if(tmScore2Term.firstKey() >= fScore) continue;
+ // otherwise we swap out our smallest one
+ bRemoveLastTerm4Score = true;
+ }
+
+
+ // we check whether we already have a very similar term in the list - in that case we keep the one with the highest score (alternatively
+ // the shorter one would also be possible, but it could have a lower score and then drop out again later - that would degrade the
+ // quality)
+ Boolean bBetterSimilarTermInList = false;
+ if(bSkipSimilarTerms)
+ {
+ for (Entry<Float, String> score2TermInList : tmScore2Term.entryList())
+ {
+ if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue;
+ // if the existing one is bigger, we don't need to add anything at all
+ if(score2TermInList.getKey() >= fScore)
+ {
+ bBetterSimilarTermInList = true;
+ break;
+ }
+ // if the new one has the better score, we have to swap it in
+ tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue());
+ }
+ }
+
+ if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey());
+ if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm);
+ }
+
+
+ return tmScore2Term;
+ }
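+
+ // Illustrative example with assumed numbers, assuming m_defaultSimilarity follows Lucene's classic tf.idf formulas
+ // (tf = sqrt(termFrequency), idf = 1 + ln(numDocs / (docFreq + 1))): for a term with term frequency 4 in the document,
+ // document frequency 3 and an index of 1000 documents, tf = 2 and idf is about 6.52, so the score used above,
+ // tf * idf * idf, is about 85.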
+
+
+
+
+
+
+ // static MultiValueTreeMap retrieveInterestingTerms(String strDocID, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords,
+ // int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException,
+ // URINotFoundException, URISyntaxException
+ // {
+ //
+ // RemoteIndexReader reader = IndexAccessor.getMultiIndexReader(hsIndexPaths, true);
+ // int iIndexDocumentCount = reader.numDocs();
+ //
+ // HashMap hsTerm2Frequency = new HashMap();
+ //
+ // // als erstes werden die frequencies aller fields aufsummiert
+ // for (String strFieldName : sAttNames4BuzzwordCalculation)
+ // {
+ //
+ // // XXX: hier ist erst mal die Anzahl der verschiedenen Terme des docs hartkodiert
+ // List topFrequentTerms = reader.getTopFrequentTerms(strDocID, strFieldName, iMinTermFreq, iMinWordLen, 1234);
+ //
+ // for (Term2FrequencyEntry topTerm2FreqLocal : topFrequentTerms)
+ // {
+ // Integer iFreqOld = hsTerm2Frequency.get(topTerm2FreqLocal.getTerm());
+ // if(iFreqOld == null)
+ // iFreqOld = topTerm2FreqLocal.getFrequency();
+ // else
+ // iFreqOld += topTerm2FreqLocal.getFrequency();
+ //
+ // hsTerm2Frequency.put(topTerm2FreqLocal.getTerm(), iFreqOld);
+ // }
+ // }
+ //
+ // // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert
+ // MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class);
+ // for (Entry term2Frequency : hsTerm2Frequency.entrySet())
+ // {
+ // String strTerm = term2Frequency.getKey();
+ // Integer iTermFrequency = term2Frequency.getValue();
+ //
+ // // wir haben angegeben, wie oft der Term mindestens da sein muß
+ // if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue;
+ //
+ // // Zahlen ignorieren wir
+ // if(!strTerm.matches("\\D+")) continue;
+ //
+ // // es wird die max-docFrequency berücksichtig (wie in MoreLikeThis)
+ // int iMaxDocumentFrequency = 0;
+ // for (String strField : sAttNames4BuzzwordCalculation)
+ // {
+ // int iDocumentFrequency = reader.documentFrequency(strField, strTerm);
+ // if(iMaxDocumentFrequency < iDocumentFrequency) iMaxDocumentFrequency = iDocumentFrequency;
+ // }
+ //
+ // if(iMinDocFreq > 0 && iMaxDocumentFrequency < iMinDocFreq) continue;
+ //
+ // // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach
+ // if(iMaxDocumentFrequency == 0) continue;
+ //
+ // // das ist die Formel der defaultSimilarity. Eine andere werden wir einfach nie brauchen
+ // float fIdf = m_defaultSimilarity.idf(iMaxDocumentFrequency, iIndexDocumentCount);
+ // float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf;
+ //
+ // boolean bRemoveLastTerm4Score = false;
+ // // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus
+ // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords)
+ // {
+ // // wir sind drüber
+ // // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen
+ // if(tmScore2Term.firstKey() >= fScore) continue;
+ // // ansonsten tauschen wir unseren kleinsten aus
+ // bRemoveLastTerm4Score = true;
+ // }
+ //
+ //
+ // // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score (alternativ
+ // // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die Qualität
+ // // verschlechtern)
+ // Boolean bBetterSimilarTermInList = false;
+ // if(bSkipSimilarTerms)
+ // {
+ // for (Entry score2TermInList : tmScore2Term.entryList())
+ // {
+ // if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue;
+ // // wenn der existierende größer ist, dann brauchen wir gar nix eintragen
+ // if(score2TermInList.getKey() >= fScore)
+ // {
+ // bBetterSimilarTermInList = true;
+ // break;
+ // }
+ // // wenn der neue vom score her besser ist, dann müssen wir den austauschen
+ // tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue());
+ // }
+ // }
+ //
+ // if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey());
+ // if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm);
+ // }
+ //
+ //
+ // return tmScore2Term;
+ // }
+
+
+ //
+ // /**
+ // * This method is for calculating buzzwords out of an arbritrary String, by giving an index attribute as 'context. The string will be tokenized according the given
+ // * analyzer for this attribute (as set by the IndexAccessor default analyzer), and also takes the document frequencies for all terms of this attribute.
+ // *
+ // * @param strDocumentText the text of the document. This text influences the buzzword calculation as it would be an attribute value of
+ // * strAttributeName4BuzzwordCalculation
+ // * @param strAttributeName4BuzzwordCalculation this is the name of the attribute the given text should be differentiated against with buzzwords
+ // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ // * @param iMinDocFreq
+ // * @param iMinTermFreq
+ // * @param iMinWordLen
+ // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document
+ // * object value of strIdFieldName.
+ // *
+ // * @return the extracted buzzwords, sorted by their according tfidf value. Key: the tfIdf value. Value: the term.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URINotFoundException
+ // * @throws URISyntaxException
+ // */
+ // static MultiValueTreeMap retrieveInterestingTerms(String strDocumentText, String strAttributeName4BuzzwordCalculation, int iMaxNumberOfBuzzwords,
+ // int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException,
+ // URINotFoundException, URISyntaxException
+ // {
+ //
+ // RemoteIndexReader reader = IndexAccessor.getMultiIndexReader(hsIndexPaths, true);
+ // int iIndexDocumentCount = reader.numDocs();
+ //
+ // // hier tokenisieren wir den übergebenen Text und ermitteln die term frequencies
+ // HashMap hsTerm2Frequency = new HashMap();
+ //
+ // TokenStream tokenStream = IndexAccessor.getDefaultAnalyzer().tokenStream(strAttributeName4BuzzwordCalculation, strDocumentText);
+ //
+ // tokenStream.reset();
+ // while (tokenStream.incrementToken())
+ // {
+ // // hier ermitteln wir die termfrequenzen für das aktuelle AttValue
+ // CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+ // String strTerm = termAttribute.toString();
+ //
+ // Integer iFrequency = hsTerm2Frequency.get(strTerm);
+ // if(iFrequency == null)
+ // hsTerm2Frequency.put(strTerm, 1);
+ // else
+ // hsTerm2Frequency.put(strTerm, iFrequency + 1);
+ // }
+ // tokenStream.close();
+ //
+ //
+ //
+ // // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert
+ // MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class);
+ // for (Entry term2Frequency : hsTerm2Frequency.entrySet())
+ // {
+ // String strTerm = term2Frequency.getKey();
+ // Integer iTermFrequency = term2Frequency.getValue();
+ //
+ //
+ // if(strTerm.length() < iMinWordLen) continue;
+ // // wir haben angegeben, wie oft der Term mindestens da sein muß
+ // if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue;
+ //
+ // // Zahlen ignorieren wir
+ // if(!strTerm.matches("\\D+")) continue;
+ //
+ // int iDocumentFrequency = reader.documentFrequency(strAttributeName4BuzzwordCalculation, strTerm);
+ //
+ // if(iMinDocFreq > 0 && iDocumentFrequency < iMinDocFreq) continue;
+ //
+ // // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach
+ // if(iDocumentFrequency == 0) continue;
+ //
+ // // das ist die Formel der defaultSimilarity. Eine andere werden wir einfach nie brauchen
+ // float fIdf = m_defaultSimilarity.idf(iDocumentFrequency, iIndexDocumentCount);
+ // float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf;
+ //
+ // boolean bRemoveLastTerm4Score = false;
+ // // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus
+ // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords)
+ // {
+ // // wir sind drüber
+ // // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen
+ // if(tmScore2Term.firstKey() >= fScore) continue;
+ // // ansonsten tauschen wir unseren kleinsten aus
+ // bRemoveLastTerm4Score = true;
+ // }
+ //
+ //
+ // // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score (alternativ
+ // // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die Qualität
+ // // verschlechtern)
+ // Boolean bBetterSimilarTermInList = false;
+ // if(bSkipSimilarTerms)
+ // {
+ // for (Entry score2TermInList : tmScore2Term.entryList())
+ // {
+ // if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue;
+ // // wenn der existierende größer ist, dann brauchen wir gar nix eintragen
+ // if(score2TermInList.getKey() >= fScore)
+ // {
+ // bBetterSimilarTermInList = true;
+ // break;
+ // }
+ // // wenn der neue vom score her besser ist, dann müssen wir den austauschen
+ // tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue());
+ // }
+ // }
+ //
+ // if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey());
+ // if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm);
+ // }
+ //
+ //
+ //
+ // return tmScore2Term;
+ // }
+
+ //
+ //
+ // static List> retrieveInterestingTerms4AllFieldValues(String strDocID, String strFieldName, int iMaxNumberOfBuzzwords,
+ // int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException,
+ // URINotFoundException, URISyntaxException
+ // {
+ //
+ // RemoteIndexReader reader = IndexAccessor.getMultiIndexReader(hsIndexPaths, true);
+ // int iIndexDocumentCount = reader.numDocs();
+ //
+ //
+ // LinkedList> llScore2TermMaps = new LinkedList>();
+ //
+ // // XXX: hier ist erst mal die Anzahl der verschiedenen Terme des docs hartkodiert
+ // for (List lTerm2Frequencies : reader.getTopFrequentTermsPerAttributeValue(strDocID, strFieldName, iMinTermFreq, iMinWordLen, 1234))
+ // {
+ //
+ // // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert
+ // MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class);
+ // for (Term2FrequencyEntry term2Frequency : lTerm2Frequencies)
+ // {
+ // String strTerm = term2Frequency.getTerm();
+ // Integer iTermFrequency = term2Frequency.getFrequency();
+ //
+ // // wir haben angegeben, wie oft der Term mindestens da sein muß
+ // if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue;
+ //
+ // // Zahlen ignorieren wir
+ // if(!strTerm.matches("\\D+")) continue;
+ //
+ // int iDocumentFrequency = reader.documentFrequency(strFieldName, strTerm);
+ //
+ // if(iMinDocFreq > 0 && iDocumentFrequency < iMinDocFreq) continue;
+ //
+ // // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach
+ // if(iDocumentFrequency == 0) continue;
+ //
+ // // das ist die Formel der defaultSimilarity. Eine andere werden wir einfach nie brauchen
+ // float fIdf = m_defaultSimilarity.idf(iDocumentFrequency, iIndexDocumentCount);
+ // float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf;
+ //
+ // boolean bRemoveLastTerm4Score = false;
+ // // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus
+ // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords)
+ // {
+ // // wir sind drüber
+ // // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen
+ // if(tmScore2Term.firstKey() >= fScore) continue;
+ // // ansonsten tauschen wir unseren kleinsten aus
+ // bRemoveLastTerm4Score = true;
+ // }
+ //
+ //
+ // // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score
+ // // (alternativ
+ // // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die
+ // // Qualität
+ // // verschlechtern)
+ // Boolean bBetterSimilarTermInList = false;
+ // if(bSkipSimilarTerms)
+ // {
+ // for (Entry score2TermInList : tmScore2Term.entryList())
+ // {
+ // if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue;
+ // // wenn der existierende größer ist, dann brauchen wir gar nix eintragen
+ // if(score2TermInList.getKey() >= fScore)
+ // {
+ // bBetterSimilarTermInList = true;
+ // break;
+ // }
+ // // wenn der neue vom score her besser ist, dann müssen wir den austauschen
+ // tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue());
+ // }
+ // }
+ //
+ // if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey());
+ // if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm);
+ // }
+ //
+ // llScore2TermMaps.add(tmScore2Term);
+ // }
+ //
+ //
+ //
+ // return llScore2TermMaps;
+ // }
+
+
+
+
+
+
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/DocumentFrqClass.java b/src/main/java/de/dfki/km/leech/lucene/basic/DocumentFrqClass.java
new file mode 100644
index 0000000..bf0fab3
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/DocumentFrqClass.java
@@ -0,0 +1,179 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+// import de.dfki.inquisitor.lucene.DynamicFieldType;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+
+public class DocumentFrqClass implements Closeable
+{
+
+
+ protected Map<String, Long> m_hsTerm2IndexFrq;
+
+ protected long m_lMaxFrq = 0;
+
+ protected DB m_mapDB;
+
+ protected IndexReader m_reader;
+
+ protected String m_strFieldName4Calculation;
+
+ protected String m_strMaxFrqTerm = "";
+
+
+
+ @SuppressWarnings("unchecked")
+ public DocumentFrqClass(IndexReader reader, String strFieldName4Calculation)
+ {
+ m_reader = reader;
+ m_strFieldName4Calculation = strFieldName4Calculation;
+
+ try
+ {
+ Logger.getLogger(DocumentFrqClass.class.getName()).info("load overall term index frequencies");
+
+
+ // OLD: m_mapDB = DBMaker.newTempFileDB().deleteFilesAfterClose().closeOnJvmShutdown().transactionDisable().make();
+ // m_hsTerm2IndexFrq = m_mapDB.getTreeMap("temp");
+ m_mapDB = DBMaker.tempFileDB().closeOnJvmShutdown().fileDeleteAfterOpen().fileDeleteAfterClose().fileLockDisable().fileMmapEnableIfSupported().make();
+ m_hsTerm2IndexFrq = (Map<String, Long>) m_mapDB.treeMap("temp").create();
+
+
+
+ Terms terms;
+
+ terms = MultiFields.getTerms(reader, strFieldName4Calculation);
+
+
+ if(terms != null)
+ {
+ TermsEnum termsEnum = terms.iterator();
+
+ while (termsEnum.next() != null)
+ {
+ long lFrequency = termsEnum.totalTermFreq();
+ String strTerm = termsEnum.term().utf8ToString();
+
+ m_hsTerm2IndexFrq.put(strTerm, lFrequency);
+ if(lFrequency > m_lMaxFrq)
+ {
+ m_lMaxFrq = lFrequency;
+ m_strMaxFrqTerm = strTerm;
+ }
+ }
+ }
+
+
+ Logger.getLogger(DocumentFrqClass.class.getName()).info("...finished");
+
+ }
+ catch (Throwable e)
+ {
+ Logger.getLogger(DocumentFrqClass.class.getName()).log(Level.SEVERE, "Error", e);
+ }
+
+ }
+
+
+
+ public boolean addDocumentFrequencyClass(int iDocNo, Document doc2modify, String strNewField4FrqClass) throws Exception
+ {
+
+ boolean bModified = false;
+ if(doc2modify.getField(strNewField4FrqClass) != null) bModified = true;
+
+ doc2modify.removeFields(strNewField4FrqClass);
+
+ if(doc2modify.getField(m_strFieldName4Calculation) == null) return bModified;
+
+
+ double dAverageFrqClass = 0;
+ int iFrqClassesCount = 0;
+
+
+
+ Terms termVector = m_reader.getTermVector(iDocNo, m_strFieldName4Calculation);
+ if(termVector == null) return bModified;
+
+ TermsEnum termsEnum = termVector.iterator();
+
+ while (termsEnum.next() != null)
+ {
+ String strTerm = termsEnum.term().utf8ToString();
+ // plain numbers are excluded
+ if(strTerm.matches("\\d*")) continue;
+ // this frequency only counts within this doc, see the API doc of reader.getTermVector(..)
+ long lFrequencyInDoc = termsEnum.totalTermFreq();
+
+
+ Long lFrequencyInIndex = m_hsTerm2IndexFrq.get(strTerm);
+ if(lFrequencyInIndex == null) continue;
+
+ int iFrqClass;
+ if(m_lMaxFrq <= 0 || lFrequencyInIndex <= 0)
+ iFrqClass = -1;
+ else
+ iFrqClass = (int) Math.floor((Math.log((m_lMaxFrq / lFrequencyInIndex)) / Math.log(2)));
+
+ if(iFrqClass >= 2)
+ {
+ dAverageFrqClass += iFrqClass * lFrequencyInDoc;
+ iFrqClassesCount += lFrequencyInDoc;
+ }
+ }
+
+
+
+ if(iFrqClassesCount > 0) dAverageFrqClass = dAverageFrqClass / iFrqClassesCount;
+
+ // we discretize to half-integer values
+ dAverageFrqClass = Math.round(dAverageFrqClass * 2);
+ // as an integer, without decimal places (the actual value times 10)
+ int iAverageFrqClass = (int) (dAverageFrqClass * 5d);
+
+
+
+ // and attach it to the doc
+ FieldType fieldType =
+ new DynamicFieldType().setIndexOptionS(IndexOptions.DOCS).setStoreD(true).setStoreTermVectorS(true)
+ .setStoreTermVectorOffsetS(true).setTokenizeD(true).freezE();
+
+ Field field4buzzwords = new Field(strNewField4FrqClass, String.valueOf(iAverageFrqClass), fieldType);
+
+
+ doc2modify.add(field4buzzwords);
+
+
+ return true;
+ }
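+
+ // Hypothetical usage sketch (not part of this patch): annotate every document of an index with its average frequency class
+ // (a term occurring about 1/8 as often as the field's most frequent term falls into frequency class floor(log2(8)) = 3).
+ // The field names are assumptions; persisting the modified documents is up to the caller.
+ //
+ // try (DocumentFrqClass frqClass = new DocumentFrqClass(reader, "body"))
+ // {
+ // for (int iDocNo = 0; iDocNo < reader.maxDoc(); iDocNo++)
+ // {
+ // Document doc = reader.document(iDocNo);
+ // frqClass.addDocumentFrequencyClass(iDocNo, doc, "frequencyClass");
+ // }
+ // }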
+
+
+
+ @Override
+ public void close() throws IOException
+ {
+ if(m_mapDB != null) m_mapDB.close();
+ m_mapDB = null;
+ m_hsTerm2IndexFrq = null;
+ m_reader = null;
+ }
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/DynamicFieldType.java b/src/main/java/de/dfki/km/leech/lucene/basic/DynamicFieldType.java
new file mode 100644
index 0000000..7434112
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/DynamicFieldType.java
@@ -0,0 +1,418 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+import com.cedarsoftware.util.io.JsonReader;
+import com.cedarsoftware.util.io.JsonWriter;
+import de.dfki.inquisitor.text.DateParser;
+import de.dfki.inquisitor.text.DateUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexOptions;
+
+import java.io.IOException;
+import java.util.Date;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+// import de.dfki.inquisitor.lucene.LuceneAnalyzerFactory;
+
+
+
+@SuppressWarnings("javadoc")
+public class DynamicFieldType extends FieldType
+{
+
+
+ public static final DynamicFieldType doubleFieldType = new DynamicFieldType(LegacyDoubleField.TYPE_STORED).freezE();
+
+ public static final DynamicFieldType floatFieldType = new DynamicFieldType(LegacyFloatField.TYPE_STORED).freezE();
+
+ public static final DynamicFieldType integerFieldType = new DynamicFieldType(LegacyIntField.TYPE_STORED).freezE();
+
+ public static final DynamicFieldType dateFieldType = new DynamicFieldType(LegacyLongField.TYPE_STORED).setDateParsing(true).freezE();
+
+ public static final DynamicFieldType keywordFieldType =
+ new DynamicFieldType().setTokenizeD(true).setStoreD(true).setStoreTermVectorS(true).setStoreTermVectorOffsetS(true)
+ .setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS).setOmitNormS(true).setAnalyzer("org.apache.lucene.analysis.core.KeywordAnalyzer")
+ .freezE();
+
+ public static final DynamicFieldType longFieldType = new DynamicFieldType(LegacyLongField.TYPE_STORED).freezE();
+
+ public static final DynamicFieldType tokenizedFieldType =
+ new DynamicFieldType().setTokenizeD(true).setStoreD(true).setStoreTermVectorS(true).setStoreTermVectorOffsetS(true)
+ .setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS).setAnalyzer("de.dfki.km.leech.lucene.LeechSimpleAnalyzer").freezE();
+
+
+
+ /**
+ * Creates Field instances according to the configuration inside the given fieldType. Number fields will be generated; if a string value is given, it will be
+ * converted in the case the fieldType is a number type. Further, the method parses Strings for dates if the fieldType is of type {@link DynamicFieldType} and
+ * configured accordingly. You can also give number values for generating number or String fields (also according to the given fieldType).
+ *
+ * @param strAttName the attributes name
+ * @param attValue the attributes value
+ * @param fieldType the field type that influences the returned type of the field
+ *
+ * @return the field, with the configured fieldType. Null in the case the Field can not be generated out of the value.
+ */
+ static public Field createField(String strAttName, Object attValue, FieldType fieldType)
+ {
+ try
+ {
+ if (attValue == null)
+ return null;
+
+
+ if (fieldType instanceof DynamicFieldType && ((DynamicFieldType) fieldType).getDateParsing() && attValue instanceof String)
+ {
+ Date parsedDate = DateParser.parseDateString((String) attValue);
+ if (parsedDate != null)
+ return new LegacyLongField(strAttName, DateUtils.date2Number(parsedDate), fieldType);
+ else
+ return null;
+ }
+ else if (attValue instanceof String)
+ {
+
+ if (fieldType.numericType() == LegacyNumericType.INT)
+ return new LegacyIntField(strAttName, Integer.valueOf((String) attValue), fieldType);
+ else if (fieldType.numericType() == LegacyNumericType.LONG)
+ return new LegacyLongField(strAttName, Long.valueOf((String) attValue), fieldType);
+ else if (fieldType.numericType() == LegacyNumericType.FLOAT)
+ return new LegacyFloatField(strAttName, Float.valueOf((String) attValue), fieldType);
+ else if (fieldType.numericType() == LegacyNumericType.DOUBLE)
+ return new LegacyDoubleField(strAttName, Double.valueOf((String) attValue), fieldType);
+ else
+ return new Field(strAttName, (String) attValue, fieldType);
+ }
+ else if (attValue instanceof Number)
+ {
+
+ if (fieldType.numericType() == LegacyNumericType.INT)
+ return new LegacyIntField(strAttName, ((Number) attValue).intValue(), fieldType);
+ else if (fieldType.numericType() == LegacyNumericType.LONG)
+ return new LegacyLongField(strAttName, ((Number) attValue).longValue(), fieldType);
+ else if (fieldType.numericType() == LegacyNumericType.FLOAT)
+ return new LegacyFloatField(strAttName, ((Number) attValue).floatValue(), fieldType);
+ else if (fieldType.numericType() == LegacyNumericType.DOUBLE)
+ return new LegacyDoubleField(strAttName, ((Number) attValue).doubleValue(), fieldType);
+ else
+ return new Field(strAttName, String.valueOf(attValue), fieldType);
+ }
+ else
+ return null;
+ } catch (Exception e)
+ {
+ Logger.getLogger(FieldConfig.class.getName()).log(Level.SEVERE, "Error", e);
+ return null;
+ }
+ }
+ protected String analyzer;
+ protected boolean dateParsing = false;
+
+ public DynamicFieldType()
+ {
+ super();
+ }
+
+
+
+ public DynamicFieldType(FieldType ref)
+ {
+ super(ref);
+ }
+
+
+
+ public Analyzer createAnalyzer()
+ {
+ try
+ {
+
+ return LuceneAnalyzerFactory.createAnalyzer(getAnalyzer(), null);
+ } catch (Exception e)
+ {
+ Logger.getLogger(DynamicFieldType.class.getName()).log(Level.SEVERE, "Error", e);
+ return null;
+ }
+ }
+
+
+
+ /**
+ * Creates Field instances according to the configuration inside the given fieldType. Number fields will be generated; if a string value is given, it will be
+ * converted in the case the fieldType is a number type. Further, the method parses Strings for dates if the fieldType is of type {@link DynamicFieldType} and
+ * configured accordingly. You can also give number values for generating number or String fields (also according to the given fieldType).
+ *
+ * @param strAttName the attributes name
+ * @param attValue the attributes value
+ *
+ * @return the field, with the configured fieldType. Null in the case the Field can not be generated out of the value.
+ */
+ public Field createField(String strAttName, Object attValue)
+ {
+ return createField(strAttName, attValue, this);
+ }
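+
+ // Hypothetical usage sketch (not part of this patch): define a tokenized field type with an analyzer and create fields from raw
+ // values; the field names, values and the date format are assumptions.
+ //
+ // DynamicFieldType type = new DynamicFieldType().setTokenizeD(true).setStoreD(true)
+ // .setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
+ // .setAnalyzer("de.dfki.km.leech.lucene.LeechSimpleAnalyzer").freezE();
+ // Field title = type.createField("title", "Leech crawling and indexing");
+ // Field created = dateFieldType.createField("created", "2014-05-07");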
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType freezE()
+ {
+ super.freeze();
+
+ return this;
+ }
+
+
+
+ /**
+ * works only if this is not frozen yet
+ */
+ public void fromJson(String strJson)
+ {
+ try
+ {
+ DynamicFieldType ref = (DynamicFieldType) JsonReader.jsonToJava(strJson);
+
+ // this.setIndexed(ref.indexed());
+ this.setStored(ref.stored());
+ this.setTokenized(ref.tokenized());
+ this.setStoreTermVectors(ref.storeTermVectors());
+ this.setStoreTermVectorOffsets(ref.storeTermVectorOffsets());
+ this.setStoreTermVectorPositions(ref.storeTermVectorPositions());
+ this.setStoreTermVectorPayloads(ref.storeTermVectorPayloads());
+ this.setOmitNorms(ref.omitNorms());
+ this.setIndexOptions(ref.indexOptions());
+ this.setDocValuesType(ref.docValuesType());
+ this.setNumericType(ref.numericType());
+ this.setNumericPrecisionStep(ref.numericPrecisionStep());
+
+ this.setAnalyzer(ref.getAnalyzer());
+ } catch (IOException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+
+
+ /**
+ * Gets the analyzer for this class. This is in addition to the base Lucene FieldType, for convenience.
+ */
+ public String getAnalyzer()
+ {
+ return this.analyzer;
+ }
+
+
+
+ public boolean getDateParsing()
+ {
+ return dateParsing;
+ }
+
+
+
+ /**
+ * Sets the analyzer for this class. The given String is the full class name of the analyzer, which can be used with Class.forName(..). This is in addition to the base
+ * Lucene FieldType, for convenience. Returns this as sugar.
+ */
+ public DynamicFieldType setAnalyzer(String analyzer)
+ {
+ this.analyzer = analyzer;
+
+ return this;
+ }
+
+
+
+ /**
+ * Specifies whether the values of this field should be parsed as date values or not. If true, all input strings will be parsed and written into the index as the
+ * corresponding number
+ *
+ * @return this as sugar
+ */
+ public DynamicFieldType setDateParsing(boolean enableDateParsing)
+ {
+ this.dateParsing = enableDateParsing;
+
+ return this;
+ }
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setDocValuesTypE(DocValuesType type)
+ {
+ super.setDocValuesType(type);
+
+ return this;
+ }
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setIndexOptionS(IndexOptions value)
+ {
+ super.setIndexOptions(value);
+
+ return this;
+ }
+
+
+
+ // /**
+ // * Same functionality as in upper class method, but returns this as sugar.
+ // **/
+ // public DynamicFieldType setIndexeD(boolean value)
+ // {
+ // super.setIndexed(value);
+ //
+ // return this;
+ // }
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setNumericPrecisionSteP(int precisionStep)
+ {
+ super.setNumericPrecisionStep(precisionStep);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setNumericTypE(LegacyNumericType type)
+ {
+ super.setNumericType(type);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setOmitNormS(boolean value)
+ {
+ super.setOmitNorms(value);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setStoreD(boolean value)
+ {
+ super.setStored(value);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setStoreTermVectorOffsetS(boolean value)
+ {
+ super.setStoreTermVectorOffsets(value);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setStoreTermVectorPayloadS(boolean value)
+ {
+ super.setStoreTermVectorPayloads(value);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setStoreTermVectorPositionS(boolean value)
+ {
+ super.setStoreTermVectorPositions(value);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setStoreTermVectorS(boolean value)
+ {
+ super.setStoreTermVectors(value);
+
+ return this;
+ }
+
+
+
+
+ /**
+ * Same functionality as the superclass method, but returns this as sugar.
+ **/
+ public DynamicFieldType setTokenizeD(boolean value)
+ {
+ super.setTokenized(value);
+
+ return this;
+ }
+
+
+
+
+ public String toJson(boolean bFormatIt)
+ {
+ try
+ {
+ String strJson = JsonWriter.objectToJson(this);
+
+ if (bFormatIt)
+ strJson = JsonWriter.formatJson(strJson);
+
+ // TODO check whether this is still necessary: https://github.com/jdereg/json-io/issues/19
+ return strJson.replaceAll(",\\s*\"ordinal\":\\d+", "");
+ } catch (IOException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/FieldConfig.java b/src/main/java/de/dfki/km/leech/lucene/basic/FieldConfig.java
new file mode 100644
index 0000000..b352661
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/FieldConfig.java
@@ -0,0 +1,135 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+import com.cedarsoftware.util.io.JsonReader;
+import com.cedarsoftware.util.io.JsonWriter;
+// import de.dfki.inquisitor.lucene.LuceneAnalyzerFactory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.document.Field;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+
+public class FieldConfig
+{
+
+
+
+ public DynamicFieldType defaultFieldType = new DynamicFieldType();
+
+
+
+ public HashMap<String, DynamicFieldType> fieldName2FieldType = new HashMap<String, DynamicFieldType>();
+
+
+
+ /**
+ * Creates a new Analyzer out of this {@link FieldConfig}, which is a {@link PerFieldAnalyzerWrapper} for all configured fields
+ *
+ * @return the according analyzer
+ *
+ * @throws Exception
+ */
+ public Analyzer createAnalyzer() throws Exception
+ {
+ return LuceneAnalyzerFactory.createAnalyzer(this);
+ }
+
+
+
+
+
+
+
+ /**
+ * Creates Field instances according to the fieldType mappings inside this {@link FieldConfig}. Number fields will be generated; if a string value is given, it will
+ * be converted in the case the fieldType is a number type. Further, the method parses Strings for dates if the fieldType is of type {@link DynamicFieldType} and
+ * configured accordingly. You can also give number values for generating number or String fields (also according to the given fieldType).
+ *
+ * @param strAttName the attributes name
+ * @param attValue the attributes value
+ *
+ * @return the field, with the configured fieldType. Null in the case the Field can not be generated out of the value.
+ */
+ public Field createField(String strAttName, Object attValue)
+ {
+ DynamicFieldType fieldType = getFieldType(strAttName);
+
+ return fieldType.createField(strAttName, attValue);
+ }
+
+
+
+
+
+ public void fromJson(String strJson)
+ {
+
+ try
+ {
+ FieldConfig fieldConfig = (FieldConfig) JsonReader.jsonToJava(strJson);
+
+ this.defaultFieldType = fieldConfig.defaultFieldType;
+
+ this.fieldName2FieldType = fieldConfig.fieldName2FieldType;
+
+
+ }
+ catch (IOException e)
+ {
+ Logger.getLogger(FieldConfig.class.getName()).log(Level.SEVERE, "Error", e);
+ }
+
+ }
+
+
+
+ /**
+ * Gets the field type for a specific field, as configured. In the case there is no explicit mapping for the field, the default type will be returned.
+ *
+ * @param strFieldName the name of the field
+ * @return the field type configured for this field, or the default type if there is no explicit mapping
+ */
+ public DynamicFieldType getFieldType(String strFieldName)
+ {
+ DynamicFieldType fieldType = fieldName2FieldType.get(strFieldName);
+
+ if(fieldType == null) fieldType = defaultFieldType;
+
+ return fieldType;
+ }
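+
+ // Hypothetical usage sketch (not part of this patch): configure a default type plus one per-field mapping, then create an
+ // analyzer and fields from it; the field names and values are assumptions.
+ //
+ // FieldConfig fieldConfig = new FieldConfig();
+ // fieldConfig.defaultFieldType = DynamicFieldType.tokenizedFieldType;
+ // fieldConfig.fieldName2FieldType.put("modified", DynamicFieldType.dateFieldType);
+ // Analyzer analyzer = fieldConfig.createAnalyzer();
+ // Field modified = fieldConfig.createField("modified", "2014-05-07");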
+
+
+
+ public String toJson(boolean bFormatIt)
+ {
+ try
+ {
+
+
+ HashMap<String, Object> hsOptions = new HashMap<>();
+ hsOptions.put(JsonWriter.ENUM_PUBLIC_ONLY, true);
+
+ String strJson = JsonWriter.objectToJson(this, hsOptions);
+
+
+ if(bFormatIt) strJson = JsonWriter.formatJson(strJson);
+
+ // return strJson.replaceAll(",\\s*\"ordinal\":\\d+", "");
+ return strJson;
+
+ }
+ catch (IOException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/IndexAccessor.java b/src/main/java/de/dfki/km/leech/lucene/basic/IndexAccessor.java
new file mode 100644
index 0000000..08d4a8e
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/IndexAccessor.java
@@ -0,0 +1,1634 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+import de.dfki.inquisitor.exceptions.ExceptionUtils;
+import de.dfki.inquisitor.logging.LoggingUtils;
+import de.dfki.inquisitor.text.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.store.NativeFSLockFactory;
+import org.apache.lucene.util.Version;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.*;
+import java.util.Map.Entry;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+
+@SuppressWarnings({"JavaDoc", "PointlessBooleanExpression"})
+public class IndexAccessor
+{
+
+ public static class BetterMultiReader extends MultiReader
+ {
+
+
+ public BetterMultiReader(IndexReader... subReaders) throws IOException
+ {
+ super(subReaders);
+ }
+
+
+
+ public BetterMultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException
+ {
+ super(subReaders, closeSubReaders);
+ }
+
+
+
+ public List<? extends IndexReader> getSubReaders()
+ {
+ return getSequentialSubReaders();
+ }
+ }
+
+
+
+
+ /**
+ * Status constants for removeReaderFromCacheWhenPossible
+ *
+ * @author Christian Reuschling, Dipl.Ing.(BA)
+ */
+ public static enum ReaderStatus {
+ READER_CLOSED, READER_IN_QUEUE, READER_NOT_IN_CACHE;
+ }
+
+
+
+
+ protected static class ReaderRefreshRunnable implements Runnable
+ {
+
+ @Override
+ public void run()
+ {
+
+ try
+ {
+ while (true)
+ {
+
+ // we wait for the configured interval
+
+ // I once had the situation that the thread only ran at the correct interval when there was some output statement before the
+ // sleep - since that actually cannot be the reason, and it was only relevant for debugging, I removed it again. The thread still
+ // came around, just not as often. But within 2 min (and not every 10 ms, as I wanted here)
+ // LinkedList dummy = new LinkedList();
+ // System.err.print(".");
+ Thread.sleep(m_lReaderRefreshIntervall);
+
+ Logger.getLogger(this.getClass().getName()).fine("will refresh all index readers");
+
+ IndexAccessor.refreshAllIndexReaders();
+ }
+
+ }
+ catch (Exception e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+
+ private static String m_strIdAttributeName;
+
+ protected static Logger logger = Logger.getLogger(IndexAccessor.class.getName());
+
+ protected static Analyzer m_analyzer4writer;
+
+ // protected static boolean m_bNativeFileLock = true;
+
+ protected static HashMap<String, IndexReader> m_hsIndexPathOrId2CurrentIndexReader = new HashMap<String, IndexReader>();
+
+ // protected static HashMap m_hsIndexPathOrURL2CurrentRemoteSearcher = new HashMap();
+
+ // even if there are several instances of luceneIndexSet, still only one writer per index may be open
+ protected static HashMap<String, IndexWriter> m_hsIndexPathOrURL2Writer = new HashMap<String, IndexWriter>();
+
+ protected static HashMap<IndexReader, String> m_hsIndexReader2IndexPath = new HashMap<IndexReader, String>();
+
+ protected static HashMap<IndexReader, Integer> m_hsIndexReader2ReaderRefCount = new HashMap<IndexReader, Integer>();
+
+
+ protected static HashMap<IndexWriter, Integer> m_hsIndexWriter2WriterRefCount = new HashMap<IndexWriter, Integer>();
+
+
+
+
+
+
+ protected static HashSet<IndexReader> m_hsReader2Remove = new HashSet<IndexReader>();
+
+
+
+ protected static HashSet<IndexReader> m_hsStaticIndexReaderSet = new HashSet<IndexReader>();
+
+
+
+
+
+ protected static long m_lReaderRefreshIntervall = 1000 * 60 * 2;
+
+
+
+ static
+ {
+
+ try
+ {
+
+
+
+ // we start the thread that refreshes the reader objects
+
+ Thread readerRefreshThread = new Thread(new ReaderRefreshRunnable(), "IndexAccessor reader refresh thread");
+ readerRefreshThread.setDaemon(true);
+ // which priority? I once had the feeling that it gets scheduled rather rarely
+ // readerRefreshThread.setPriority(Thread.MIN_PRIORITY);
+ // readerRefreshThread.setPriority(Thread.MAX_PRIORITY);
+ readerRefreshThread.start();
+
+
+
+ // a shutdown hook to make sure that all objects get closed - after all, we don't want to block other processes
+
+ Runtime.getRuntime().addShutdownHook(new Thread()
+ {
+ @Override
+ public void run()
+ {
+ try
+ {
+ IndexAccessor.forceCloseAll();
+ }
+ catch (Exception ex)
+ {
+ throw new RuntimeException(ex);
+ }
+ }
+ });
+
+
+ }
+ catch (Exception e)
+ {
+ Logger.getLogger(IndexAccessor.class.getName()).log(Level.SEVERE, "Error", e);
+ }
+
+ }
+
+
+
+
+ /**
+ * Adds a reader object to the cache. This reader will be static, which means that it won't be refreshed in any case, independent of which method you invoke on
+ * {@link IndexAccessor}, nor in the refresh thread. You can get this reader with {@link #getLuceneIndexReader(String, boolean)}, with strIndexID as parameter. You can also remove
+ * the reader from the cache with {@link #removeReaderFromCache(String)}, {@link #removeReaderFromCacheWhenPossible(String)} and {@link #removeUnusedReadersFromCache()}
+ *
+ *
+ * @param strIndexID a unique ID for the reader
+ * @param staticReader the reader Object
+ */
+ synchronized static public void addStaticReader(String strIndexID, IndexReader staticReader)
+ {
+ // we remember the reader so that we don't accidentally replace/refresh it later
+ m_hsStaticIndexReaderSet.add(staticReader);
+
+ // and it also goes into the cache under its ID
+ m_hsIndexPathOrId2CurrentIndexReader.put(strIndexID, staticReader);
+ }
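+
+ // Hypothetical usage sketch (not part of this patch): register an already opened reader under a fixed ID so it is never
+ // refreshed; the index path and the ID are assumptions.
+ //
+ // IndexReader staticReader = DirectoryReader.open(FSDirectory.open(Paths.get("/data/archive-index")));
+ // IndexAccessor.addStaticReader("archiveIndex", staticReader);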
+
+
+
+ /**
+ * Creates a new, empty Lucene index under the given path
+ *
+ * @param strIndexPathOrURL the path for the new Lucene index. In the case the path does not exist, it will be created
+ * @param bForceAndOverwrite if this is false, the index will only be created in the case there is no existing index under strIndexPathOrURL
+ *
+ * @return true in the case the index was newly created, false otherwise. In the case strIndexPathOrURL exists and is a file, it will not be created in any case
+ *
+ * @throws IOException
+ * @throws CorruptIndexException
+ */
+ synchronized static public boolean createNewIndex(String strIndexPathOrURL, boolean bForceAndOverwrite) throws CorruptIndexException, IOException
+ {
+ boolean bCreateNew = false;
+
+ File fIndexPath = new File(strIndexPathOrURL);
+
+ if(!fIndexPath.exists())
+ {
+ fIndexPath.mkdirs();
+
+ bCreateNew = true;
+ }
+
+ FSDirectory dir = createFSDirectory(fIndexPath);
+
+ if(bCreateNew == false && (!DirectoryReader.indexExists(dir) || bForceAndOverwrite))
+ {
+ bCreateNew = true;
+ }
+
+ if(!bCreateNew) return false;
+
+
+
+ logger.fine("will open indexWriter for '" + strIndexPathOrURL + "'");
+
+ // if, e.g., a video attachment is mistakenly processed as fulltext, we get huge docs, lots of memory and long runtimes... for this
+ // reason I set the MaxFieldLength back to limited
+ @SuppressWarnings("deprecation")
+ IndexWriter ourIndexWriter = new IndexWriter(dir, new IndexWriterConfig(getDefaultAnalyzer()).setOpenMode(OpenMode.CREATE));
+
+ ourIndexWriter.close();
+
+ return true;
+ }
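+
+ // Hypothetical usage sketch (not part of this patch): create an empty index only if none exists yet; the path is an assumption.
+ //
+ // boolean bCreated = IndexAccessor.createNewIndex("/data/leech-index", false);
+ // if(bCreated)
+ // Logger.getLogger(IndexAccessor.class.getName()).info("created new, empty index");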
+
+
+
+
+
+ // /**
+ // * Enable or disable native file locking. We recommend the native lock, which is also the default.
+ // *
+ // * @param bNativeFileLock true in the case you want to use native file OS locks. These could be problematic on NFS drives (see {@link NativeFSLockFactory}). I
+ // * recommend to use the native File lock (stress tests on our NFS system have shown that this is really an atomar, working lock - the other lock leads to
+ // * exceptions (at least in ealier versions of Lucene)
+ // */
+ // static public void enableNativeFileLock(boolean bNativeFileLock)
+ // {
+ // m_bNativeFileLock = bNativeFileLock;
+ // }
+
+
+
+ /**
+ * Gets the default analyzer that will be used for writer creation
+ *
+ * @return the default analyzer that will be used for writer creation
+ */
+ static public Analyzer getDefaultAnalyzer()
+ {
+ return m_analyzer4writer;
+ }
+
+
+
+ /**
+ * Gets the default attribute name that will be used for RemoteIndexReader creation
+ *
+ * @return the default attribute name that will be used for RemoteIndexReader creation
+ */
+ static public String getDefaultIndexIdAttribute()
+ {
+ return IndexAccessor.m_strIdAttributeName;
+ }
+
+
+ //
+ // /**
+ // * Gets the reader for a given index path. The reader will be refreshed if there are any new changes in the index. In the case you pass an static reader ID to this
+ // * method, it will be identically to {@link #getIndexReader(String)}. You dont have to release a RemoteIndexReader.
+ // *
+ // * @param strIndexPathOrURL the path to the index where you want to read from
+ // *
+ // * @return the reader object that reflects the current state of the index
+ // *
+ // * @throws IOException
+ // * @throws CorruptIndexException
+ // * @throws URISyntaxException
+ // */
+ // public synchronized static RemoteIndexReader getFreshIndexReader(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException
+ // {
+ // refreshIndexReader(strIndexPathOrURL, false);
+ //
+ // return getIndexReader(strIndexPathOrURL);
+ // }
+
+ //
+ //
+ // /**
+ // * Gets the reader for the given index path. The reader will be created when necessary. In the case the specified directory does not exists or is empty, an empty
+ // * index will NOT be created.
+ // * Remark:
+ // * Note that refreshing a reader is a relatively expensive operation. The reader Object returned from this method must not reflect the current state of the index. To
+ // * get a guaranteed up to date, refreshed reader object, you have the following possibilities:
+ // * invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()} use the method {@link #getFreshIndexReader(String)}
+ // *
+ // * You can also set a time intervall where all reader Objects will be refreshed for {@link #getLuceneIndexReader(String, boolean)} periodically with the method
+ // * {@link #setReaderRefreshIntervall(long)}
+ // * You dont have to release a RemoteIndexReader.
+ // *
+ // * @param strIndexPathOrURL the path to the index you wants to read from
+ // *
+ // * @return the index reader object
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URISyntaxException
+ // */
+ // public synchronized static RemoteIndexReader getIndexReader(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException
+ // {
+ // return getIndexReader(strIndexPathOrURL, false);
+ // }
+
+
+ //
+ // /**
+ // * Gets the reader for the given index path. The reader will be created when necessary. In the case the specified directory does not exists or is empty, an empty
+ // * index will be created, if you want.
+ // * Remark:
+ // * Note that refreshing a reader is a relatively expensive operation. The reader Object returned from this method must not reflect the current state of the index. To
+ // * get a guaranteed up to date, refreshed reader object, you have the following possibilities:
+ // * invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()} use the method {@link #getFreshIndexReader(String)}
+ // *
+ // * You can also set a time intervall where all reader Objects will be refreshed for {@link #getIndexReader(String, boolean)} periodically with the method
+ // * {@link #setReaderRefreshIntervall(long)}
+ // * You dont have to release a RemoteIndexReader.
+ // *
+ // * @param strIndexPathOrURL the path to the index you wants to read from. This can be a simple path 'e.g. /home/hitzliputzli' or with URI Syntax
+ // * ('file:\\/home/hitzliputzli'). In the case the specified protocoll is not of type 'file', and delight is in the classpath, the method tries to create a
+ // * delight client object.
+ // * @param bCreateIndexIfNotExist if true, the index will be created in the case he did not exist
+ // *
+ // * @return the index reader object
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // * @throws URISyntaxException
+ // */
+ // synchronized static public RemoteIndexReader getIndexReader(String strIndexPathOrURL, boolean bCreateIndexIfNotExist) throws CorruptIndexException, IOException,
+ // URISyntaxException
+ // {
+ //
+ // RemoteIndexReader remoteIndexReader;
+ //
+ //
+ // if(isLocalPath(strIndexPathOrURL))
+ // {
+ // // lokal - wir rufen einfach die entsprechene LuceneReader-Methode einmal auf, um das Objekt intern zu erstellen
+ // IndexReader luceneIndexReader = getLuceneIndexReader(strIndexPathOrURL, bCreateIndexIfNotExist);
+ // releaseLuceneIndexReader(luceneIndexReader);
+ //
+ // // das zugrundeliegende Objekt wurde initialisiert, nun einfach den String/Pfad basierten 'wrapper'
+ // remoteIndexReader = new RemoteIndexReaderImpl(strIndexPathOrURL, m_strIdAttributeName);
+ // }
+ // else
+ // {
+ // // wir versuchen, eine Verbindung zu einem RemoteReader aufzubauen
+ // strIndexPathOrURL = strIndexPathOrURL.replaceAll("/$", "");
+ // String strHandlerName = strIndexPathOrURL.substring(strIndexPathOrURL.lastIndexOf('/') + 1) + "_reader";
+ // String strServiceUrl = strIndexPathOrURL.replaceAll("/[^/]+$", "");
+ //
+ //
+ // remoteIndexReader = delight.connectingTo(strServiceUrl).usingApi(strHandlerName, RemoteIndexReader.class);
+ // }
+ //
+ //
+ // return remoteIndexReader;
+ // }
+
+
+
+
+
+ /**
+ * Gets all index paths that are currently inside the reader cache
+ *
+ * @return all index paths that are currently inside the reader cache
+ */
+ public static Set<String> getIndexReaderPathsAndIDs()
+ {
+ return m_hsIndexPathOrId2CurrentIndexReader.keySet();
+ }
+
+
+ //
+ // synchronized static public RemoteIndexSearcher getIndexSearcher(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException
+ // {
+ // RemoteIndexSearcher searcher4Index;
+ //
+ //
+ // if(isLocalPath(strIndexPathOrURL))
+ // {
+ //
+ // // lokal - wir rufen einfach die entsprechene LuceneReader-Methode einmal auf, um das Objekt intern zu erstellen
+ // IndexReader luceneIndexReader = getLuceneIndexReader(strIndexPathOrURL, false);
+ // releaseLuceneIndexReader(luceneIndexReader);
+ //
+ // // das zugrundeliegende Objekt wurde initialisiert, nun einfach den String/Pfad basierten 'wrapper'
+ // searcher4Index = new RemoteIndexSearcherImpl(strIndexPathOrURL, m_strIdAttributeName);
+ // }
+ // else
+ // {
+ //
+ // // es gibt zumindest keinen lokalen Index - dann könnte es noch eine remotegeschichte sein
+ //
+ // searcher4Index = m_hsIndexPathOrURL2CurrentRemoteSearcher.get(strIndexPathOrURL);
+ // if(searcher4Index == null)
+ // {
+ //
+ // logger.fine("will create new remote searcher for index '" + strIndexPathOrURL + "'");
+ //
+ // strIndexPathOrURL = strIndexPathOrURL.replaceAll("/$", "");
+ // String strHandlerName = strIndexPathOrURL.substring(strIndexPathOrURL.lastIndexOf('/') + 1) + "_searcher";
+ // String strServiceUrl = strIndexPathOrURL.replaceAll("/[^/]+$", "");
+ //
+ //
+ // searcher4Index = delight.connectingTo(strServiceUrl).usingApi(strHandlerName, RemoteIndexSearcher.class);
+ //
+ //
+ // m_hsIndexPathOrURL2CurrentRemoteSearcher.put(strIndexPathOrURL, searcher4Index);
+ // }
+ // }
+ //
+ //
+ // return searcher4Index;
+ // }
+ //
+
+
+ /**
+ * Gets a writer instance for an index. DON'T close your writer afterwards - use the releaseIndexWriter(..) method instead, and make SURE not to
+ * forget this. The close will be done automatically when no instance needs the writer anymore; closing it yourself would break other threads that still work with the index.
+ * The default analyzer will be used.
+ * In the case the specified directory does not exist or is empty, an empty index will be created.
+ * Remark:
+ * You can change the timeout Lucene waits for getting write access by setting IndexWriter.WRITE_LOCK_TIMEOUT
+ * It is in almost any case no good idea to keep an IndexWriter member variable that holds the reference returned by this method. This would block all other
+ * processes that want to get access to the index. You can do this in a short-living object, but know exactly what you do...
+ *
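+ * A minimal usage sketch (the index path and the analyzer are just examples, error handling omitted):
+ * <pre>{@code
+ * IndexAccessor.setDefaultAnalyzer(new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
+ *
+ * IndexWriter writer = IndexAccessor.getIndexWriter("/tmp/exampleIndex");
+ * try
+ * {
+ *     writer.addDocument(new org.apache.lucene.document.Document());
+ * }
+ * finally
+ * {
+ *     IndexAccessor.releaseIndexWriter(writer);
+ * }
+ * }</pre>
+ *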
+ * @param strIndexPathOrURL the path to the index
+ *
+ * @return a writer instance for the given index. Autocommit will be FALSE.
+ *
+ * @throws CorruptIndexException
+ * @throws LockObtainFailedException
+ * @throws IOException
+ */
+ synchronized static public IndexWriter getIndexWriter(String strIndexPathOrURL) throws CorruptIndexException, LockObtainFailedException, IOException
+ {
+ if(getDefaultAnalyzer() == null) logger.severe("default analyzer is not set - this will cause a NullPointerException. Set it before creating an IndexWriter.");
+ return getIndexWriter(strIndexPathOrURL, getDefaultAnalyzer());
+ }
+
+
+
+ /**
+ * Gets a writer instance for an index. DON'T close your writer afterwards - use the releaseIndexWriter(..) method instead, and make SURE not
+ * to forget this. The close will be done automatically when no instance needs the writer anymore; closing it yourself would break other threads that still work with the index.
+ * In the case the specified directory does not exist or is empty, an empty index will be created.
+ * Remark:
+ * You can change the timeout Lucene waits for getting write access by setting IndexWriter.WRITE_LOCK_TIMEOUT
+ * It is in almost any case no good idea to keep an IndexWriter member variable that holds the reference returned by this method. This would block all other
+ * processes that want to get access to the index. You can do this in a short-living object, but know exactly what you do...
+ *
+ * @param strIndexPathOrURL the path to the index
+ * @param analyzer the Lucene analyzer that should be used for this writer creation
+ *
+ * @return a writer instance for the given index. Autocommit will be FALSE.
+ *
+ * @throws CorruptIndexException
+ * @throws LockObtainFailedException
+ * @throws IOException
+ */
+ @SuppressWarnings("deprecation")
+ synchronized static public IndexWriter getIndexWriter(String strIndexPathOrURL, Analyzer analyzer) throws CorruptIndexException, LockObtainFailedException,
+ IOException
+ {
+
+ // do we already have an open writer?
+ IndexWriter ourIndexWriter = m_hsIndexPathOrURL2Writer.get(strIndexPathOrURL);
+
+
+ // if not, we create a new one
+ if(ourIndexWriter == null)
+ {
+ // if it is an empty directory or does not exist yet, we also create a new index right away
+ createNewIndex(strIndexPathOrURL, false);
+
+ FSDirectory dir = createFSDirectory(new File(strIndexPathOrURL));
+
+ logger.fine("will open indexWriter for '" + strIndexPathOrURL + "'");
+
+ ourIndexWriter = new IndexWriter(dir, new IndexWriterConfig( analyzer).setOpenMode(OpenMode.APPEND));
+
+ m_hsIndexPathOrURL2Writer.put(strIndexPathOrURL, ourIndexWriter);
+ }
+
+ // we manage reference tokens - these have to be released again with releaseIndexWriter
+ Integer iOld = m_hsIndexWriter2WriterRefCount.get(ourIndexWriter);
+ if(iOld == null)
+ m_hsIndexWriter2WriterRefCount.put(ourIndexWriter, 1);
+ else
+ m_hsIndexWriter2WriterRefCount.put(ourIndexWriter, ++iOld);
+
+ if(logger.isLoggable(Level.FINEST)) logger.finest("get indexWriter for '" + strIndexPathOrURL + "'\n" + LoggingUtils.getCurrentStackTrace());
+
+ return ourIndexWriter;
+ }
+
+
+
+ /**
+ * Gets all index paths that are currently inside the writer cache
+ *
+ * @return all index paths that are currently inside the writer cache
+ */
+ public static Set<String> getIndexWriterPaths()
+ {
+ return m_hsIndexPathOrURL2Writer.keySet();
+ }
+
+
+
+ /**
+ * This is an expert method - the use of RemoteIndexReader is recommended. Gets the reader for the given index path. The reader will be created when necessary. In the
+ * case the specified directory does not exist or is empty, an empty index will be created, if you want.
+ * Remark:
+ * Note that refreshing a reader is a relatively expensive operation. The reader object returned from this method does not necessarily reflect the current state of the
+ * index. To get a guaranteed up-to-date, refreshed reader object, you have the following possibilities:
+ * invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()}
+ *
+ * You can also set a time interval after which all reader objects returned by {@link #getLuceneIndexReader(String, boolean)} will be refreshed periodically with the method
+ * {@link #setReaderRefreshIntervall(long)}
+ * Don't forget to release your reader object with {@link #releaseLuceneIndexReader(IndexReader)}
+ *
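+ * A minimal usage sketch (the index path is just an example, error handling omitted):
+ * <pre>{@code
+ * IndexReader reader = IndexAccessor.getLuceneIndexReader("/tmp/exampleIndex", false);
+ * try
+ * {
+ *     int iDocCount = reader.numDocs();
+ * }
+ * finally
+ * {
+ *     IndexAccessor.releaseLuceneIndexReader(reader);
+ * }
+ * }</pre>
+ *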
+ * @param strIndexPathOrURL the path to the index you want to read from. This can be a simple path (e.g. '/home/hitzliputzli') or in URI syntax
+ * ('file:///home/hitzliputzli').
+ * @param bCreateIndexIfNotExist if true, the index will be created in the case it does not exist
+ *
+ * @return the index reader object
+ *
+ * @throws CorruptIndexException
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ synchronized static public IndexReader getLuceneIndexReader(String strIndexPathOrURL, boolean bCreateIndexIfNotExist) throws CorruptIndexException, IOException,
+ URISyntaxException
+ {
+ IndexReader reader = m_hsIndexPathOrId2CurrentIndexReader.get(strIndexPathOrURL);
+
+ // if we don't have one yet, we create one
+ if(reader == null)
+ {
+
+ logger.fine("will create new reader for index '" + strIndexPathOrURL + "'");
+
+
+ File fIndex = null;
+ // the great URI implementation really accepts everything that is no URI without a syntax exception - in particular a plain path :(
+
+ if(strIndexPathOrURL.startsWith("file:"))
+ fIndex = new File(new URI(strIndexPathOrURL));
+ else
+ fIndex = new File(strIndexPathOrURL);
+
+
+
+ // if it is an empty directory or does not exist yet, we also create a new index right away
+ if(bCreateIndexIfNotExist) createNewIndex(strIndexPathOrURL, false);
+
+ Directory dir = createFSDirectory(fIndex);
+
+
+ reader = DirectoryReader.open(dir);
+
+
+ // this map always holds the newest reader - the old ones are closed again in the release method
+ m_hsIndexPathOrId2CurrentIndexReader.put(strIndexPathOrURL, reader);
+ }
+
+
+ // the reference token for this index is incremented
+ Integer iOld = m_hsIndexReader2ReaderRefCount.get(reader);
+ if(iOld == null)
+ {
+ m_hsIndexReader2ReaderRefCount.put(reader, 1);
+ m_hsIndexReader2IndexPath.put(reader, strIndexPathOrURL);
+ }
+ else
+ m_hsIndexReader2ReaderRefCount.put(reader, ++iOld);
+
+
+ if(logger.isLoggable(Level.FINEST)) logger.finest("get reader for index '" + strIndexPathOrURL + "'\n" + LoggingUtils.getCurrentStackTrace());
+
+ return reader;
+ }
+
+
+
+ synchronized static public IndexSearcher getLuceneIndexSearcher(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException
+ {
+ logger.fine("will create new searcher for index '" + strIndexPathOrURL + "'");
+
+ IndexSearcher searcher4Index = new IndexSearcher(getLuceneIndexReader(strIndexPathOrURL, false));
+
+
+
+ return searcher4Index;
+ }
+
+
+
+ synchronized static public IndexSearcher getLuceneMultiSearcher(LinkedHashSet<String> sIndexPathsOrURLs) throws CorruptIndexException, IOException,
+ URISyntaxException
+ {
+ logger.fine("will create new searcher for index '" + sIndexPathsOrURLs + "'");
+
+ IndexSearcher searcher4Index = new IndexSearcher(getLuceneMultiReader(sIndexPathsOrURLs, false));
+
+
+
+ return searcher4Index;
+ }
+
+
+
+ /**
+ * Gets the Lucene MultiReader for all given LOCAL reader paths (paths that point to the file system, not to a remote index). The readers will be created when
+ * necessary. In the case a specified directory does not exist or is empty, an empty index will be created, if you want.
+ * Remark:
+ * Note that refreshing a reader is a relatively expensive operation. The reader object returned from this method does not necessarily reflect the current state of the
+ * index. To get a guaranteed up-to-date, refreshed reader object, you have the following possibilities:
+ * invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()}
+ *
+ * You can also set a time interval after which all reader objects returned by {@link #getLuceneIndexReader(String, boolean)} will be refreshed periodically with the method
+ * {@link #setReaderRefreshIntervall(long)}
+ * Note that, unlike a RemoteIndexReader, the returned Lucene reader has to be released with {@link #releaseLuceneIndexReader(IndexReader)}.
+ *
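+ * A minimal usage sketch (the index paths are just examples, error handling omitted); releasing the MultiReader releases all wrapped readers:
+ * <pre>{@code
+ * LinkedHashSet<String> sIndexPaths = new LinkedHashSet<String>();
+ * sIndexPaths.add("/tmp/exampleIndexA");
+ * sIndexPaths.add("/tmp/exampleIndexB");
+ *
+ * MultiReader multiReader = IndexAccessor.getLuceneMultiReader(sIndexPaths, false);
+ * try
+ * {
+ *     int iDocCount = multiReader.numDocs();
+ * }
+ * finally
+ * {
+ *     IndexAccessor.releaseLuceneIndexReader(multiReader);
+ * }
+ * }</pre>
+ *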
+ * @param sIndexPathsOrURLs the paths to the indices you want to read from. This can be a simple path (e.g. '/home/hitzliputzli') or in URI syntax
+ * ('file:///home/hitzliputzli'). In the case the specified protocol is not of type 'file', the entry will be ignored.
+ * @param bCreateIndexIfNotExist if true, the index will be created in the case it does not exist
+ *
+ * @return the index reader object
+ *
+ * @throws CorruptIndexException
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ synchronized static public MultiReader getLuceneMultiReader(LinkedHashSet<String> sIndexPathsOrURLs, boolean bCreateIndexIfNotExist) throws CorruptIndexException,
+ IOException, URISyntaxException
+ {
+
+
+ LinkedList<IndexReader> lReaders = new LinkedList<>();
+ for (String strIndexPathOrUrl : sIndexPathsOrURLs)
+ {
+
+ if(isLocalPath(strIndexPathOrUrl))
+ {
+ // local - we simply call the according Lucene reader method once so the object gets created internally
+ IndexReader luceneIndexReader = getLuceneIndexReader(strIndexPathOrUrl, bCreateIndexIfNotExist);
+
+
+ lReaders.add(luceneIndexReader);
+ }
+ else
+ {
+ // ignore
+ }
+
+ }
+
+
+ BetterMultiReader multiReader = new BetterMultiReader(lReaders.toArray(new IndexReader[0]), false);
+
+
+ return multiReader;
+ }
+
+
+ //
+ // /**
+ // * Gets a MultiReader that wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader.
+ // *
+ // * @param indexPathsOrIDs2CreateIfNotExist the set of indices that should be wrapped by the MultiReader. The last reader in the list will be stable with respect to
+ // * write modifications during the livetime of this MultiReader, because the documents index number will stay stable in this index. For each index, you can
+ // * specify whether she should be created or not in the case it not exists.
+ // *
+ // * @return a MultiReader the wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // */
+ // public synchronized static RemoteIndexReader getMultiIndexReader(LinkedHashMap indexPathsOrIDs2CreateIfNotExist) throws CorruptIndexException,
+ // IOException
+ // {
+ //
+ // // wir trennen die lokalen von den remote-URLs. Mit den lokalen machen wir EINEN LuceneMultiReader, und dann packen wir die remotes dazu
+ //
+ // // Wir trennen in remote-und lokale Indizes
+ // LinkedList lLocalIndices = new LinkedList<>();
+ // LinkedList lRemoteIndices = new LinkedList<>();
+ //
+ // for (Entry strIndexPathOrURL2CreateIfNotExist : indexPathsOrIDs2CreateIfNotExist.entrySet())
+ // {
+ //
+ // String strIndexPathOrURL = strIndexPathOrURL2CreateIfNotExist.getKey();
+ // Boolean bCreateIfNotExist = strIndexPathOrURL2CreateIfNotExist.getValue();
+ //
+ // if(isLocalPath(strIndexPathOrURL))
+ // {
+ // lLocalIndices.add(strIndexPathOrURL);
+ // if(bCreateIfNotExist) createNewIndex(strIndexPathOrURL, false);
+ // }
+ // else
+ // {
+ // lRemoteIndices.add(strIndexPathOrURL);
+ // }
+ // }
+ //
+ //
+ // LinkedList llReaderz = new LinkedList();
+ //
+ // // der lokale MultiReader
+ // de.dfki.inquisition.lucene.RemoteIndexReader localReader = new RemoteIndexReaderImpl(lLocalIndices.toArray(new String[0]));
+ // localReader.setIdAttributename(m_strIdAttributeName);
+ // llReaderz.add(localReader);
+ //
+ //
+ // // die remote reader
+ // for (String strRemoteURL : lRemoteIndices)
+ // {
+ //
+ // try
+ // {
+ // // index creation is of no sense when we have a remote reader anyway
+ // de.dfki.inquisition.lucene.RemoteIndexReader reader = getIndexReader(strRemoteURL, false);
+ // // check if this reader is available
+ // reader.numDocs();
+ //
+ // llReaderz.add(reader);
+ // }
+ // catch (Exception e)
+ // {
+ // logger.log(Level.SEVERE, "Exception while creating a remote index reader. The index '" + strRemoteURL + "' will be ignored. ('" + e.getMessage() + "')");
+ // logger.log(Level.FINE, "Exception for index '" + strRemoteURL + "': ", e);
+ // }
+ // }
+ //
+ //
+ // // und daraus erzeugen wir uns jetzt nen MultiReader
+ // if(llReaderz.size() == 1) return llReaderz.get(0);
+ //
+ // RemoteMultiIndexReader multiReader = new RemoteMultiIndexReader(llReaderz.toArray(new de.dfki.inquisition.lucene.RemoteIndexReader[0]));
+ //
+ //
+ // return multiReader;
+ // }
+
+
+
+ // /**
+ // * Gets a MultiReader that wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader.
+ // *
+ // * @param indexPathsOrIDs the set of indices that should be wrapped by the MultiReader. The last reader in the list will be stable with respect to write modifications
+ // * during the livetime of this MultiReader, because the documents index number will stay stable in this index. For each index, the index will NOT created
+ // * in the case it does not exists
+ // *
+ // * @return a MultiReader the wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // */
+ // public synchronized static RemoteIndexReader getMultiIndexReader(LinkedHashSet indexPathsOrIDs) throws CorruptIndexException, IOException
+ // {
+ // return getMultiIndexReader(indexPathsOrIDs, false);
+ // }
+
+ //
+ //
+ // /**
+ // * Gets a MultiReader that wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader.
+ // *
+ // * @param indexPathsOrIDs the set of indices that should be wrapped by the MultiReader. The last reader in the list will be stable with respect to write modifications
+ // * during the livetime of this MultiReader, because the documents index number will stay stable in this index. For each index, the index will NOT created
+ // * in the case it does not exists (beside the last one if you want it)
+ // * @param bCreateLastIndexInListIfNotExist if true, the last index in the list will be created in the case it does not exist
+ // *
+ // * @return a MultiReader the wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // */
+ // public synchronized static RemoteIndexReader getMultiIndexReader(LinkedHashSet indexPathsOrIDs, boolean bCreateLastIndexInListIfNotExist)
+ // throws CorruptIndexException, IOException
+ // {
+ // LinkedHashMap hsIndexPathsOrIDs2CreateIfNotExist = new LinkedHashMap();
+ //
+ //
+ // int i = 0;
+ // for (String strIndexPathOrURL : indexPathsOrIDs)
+ // {
+ // boolean bCreateIfNotExist = false;
+ // if(i == indexPathsOrIDs.size() - 1) bCreateIfNotExist = bCreateLastIndexInListIfNotExist;
+ //
+ // hsIndexPathsOrIDs2CreateIfNotExist.put(strIndexPathOrURL, bCreateIfNotExist);
+ //
+ // i++;
+ // }
+ //
+ // return getMultiIndexReader(hsIndexPathsOrIDs2CreateIfNotExist);
+ // }
+
+
+ //
+ // /**
+ // * Gets a MultiReader that wrapps all currently cached index readers. You dont have to release a RemoteIndexReader.
+ // *
+ // * @param strLastIndexInListPathOrID this will be the last reader in the list of reader offered to the MultiReader Constructor. In this index you can write and read
+ // * in parallel, because the document numbers will not change during writing (until index optimization). In the case you don't write to any index, the order
+ // * is irrelevant and you can set this paraeter simply null
+ // *
+ // * @return a MultiReader that wrapps all currently cached index readers. You dont have to release a RemoteIndexReader.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // */
+ // public synchronized static RemoteMultiIndexReader getMultiIndexReader(String strLastIndexInListPathOrID) throws CorruptIndexException, IOException
+ // {
+ // return getMultiIndexReader(strLastIndexInListPathOrID, false);
+ // }
+
+
+
+ // /**
+ // * Gets a MultiReader that wrapps all currently cached index readers. Don't forget to release it with {@link #releaseLuceneIndexReader(IndexReader)}
+ // *
+ // * @param strLastIndexInListPathOrID this will be the last reader in the list of reader offered to the MultiReader Constructor. In this index you can write and read
+ // * in parallel, because the document numbers will not change during writing (until index optimization). In the case you don't write to any index, the order
+ // * is irrelevant and you can set this paraeter simply null
+ // * @param bCreateLastIndexInListIfNotExist if true, the last index in the list will be created in the case it does not exist
+ // *
+ // * @return a MultiReader that wrapps all currently cached index readers. You dont have to release a RemoteIndexReader.
+ // *
+ // * @throws CorruptIndexException
+ // * @throws IOException
+ // */
+ // public synchronized static RemoteMultiIndexReader getMultiIndexReader(String strLastIndexInListPathOrID, boolean bCreateLastIndexInListIfNotExist)
+ // throws CorruptIndexException, IOException
+ // {
+ // LinkedList llReaderz = new LinkedList();
+ //
+ //
+ // // der reader, auf den auch schreibend zugegriffen werden kann, machen wir am Schluß rein - ich habe die Hoffnung,
+ // // daß sich dann nicht die docIDs verschieben, wenn gleichzeitig geschrieben und in diesem und in externen Indices
+ // // gesucht wird...die externen müssen halt readonly sein...und des funzt auch :)
+ //
+ //
+ // HashSet hsIndexPaths = new HashSet();
+ // hsIndexPaths.addAll(getIndexReaderPathsAndIDs());
+ //
+ // // aaalso. wir erstellen alle Readers, und für den letzten wird das Flag eingesetzt...
+ // for (String strIndexPathOrURL : hsIndexPaths)
+ // {
+ //
+ // boolean bIsLast = strIndexPathOrURL.equals(strLastIndexInListPathOrID);
+ //
+ // try
+ // {
+ //
+ // de.dfki.inquisition.lucene.RemoteIndexReader reader;
+ // if(bIsLast)
+ // reader = getIndexReader(strIndexPathOrURL, bCreateLastIndexInListIfNotExist);
+ // else
+ // reader = getIndexReader(strIndexPathOrURL, false);
+ //
+ //
+ // if(strLastIndexInListPathOrID == null || llReaderz.size() == 0 || bIsLast)
+ // llReaderz.addLast(reader);
+ // else
+ // llReaderz.addFirst(reader);
+ //
+ // }
+ // catch (Exception e)
+ // {
+ // logger.log(Level.SEVERE, "Exception while creating a MultiReader. The index '" + strIndexPathOrURL + "' will be ignored. ('" + e.getMessage() + "')");
+ // logger.log(Level.FINE, "Exception for index '" + strIndexPathOrURL + "': ", e);
+ // }
+ // }
+ //
+ //
+ // // und daraus erzeugen wir uns jetzt nen MultiReader
+ // RemoteMultiIndexReader multiReader = new RemoteMultiIndexReader(llReaderz.toArray(new RemoteIndexReader[0]));
+ //
+ //
+ // return multiReader;
+ // }
+
+
+
+ // synchronized static public RemoteIndexSearcher getMultiIndexSearcher(LinkedHashSet indexPathsOrURLs) throws IOException, URISyntaxException
+ // {
+ //
+ // // - wir erzeugen uns einen searcher aus jeder Quelle - ganz einfach mit getIndexSearcher. Da wird dann auch die Unterscheidung zwischen
+ // // lokal- und remoteSearcher gemacht.
+ // // - wir nehmen den wunderschönen ParallelMultiSearcher - verteilte document frequency + multithreaded Suche....sehr schön :)...den gibts nicht mehr :(
+ //
+ //
+ //
+ // // Wir trennen in remote-und lokale Indizes
+ // LinkedList lLocalIndices = new LinkedList<>();
+ // LinkedList lRemoteIndices = new LinkedList<>();
+ //
+ // for (String strIndexPathOrURL : indexPathsOrURLs)
+ // {
+ // if(isLocalPath(strIndexPathOrURL))
+ // {
+ // lLocalIndices.add(strIndexPathOrURL);
+ // }
+ // else
+ // {
+ // lRemoteIndices.add(strIndexPathOrURL);
+ // }
+ // }
+ //
+ //
+ // LinkedList llSearcherz = new LinkedList();
+ //
+ // // der lokale MultiSearcher
+ // RemoteIndexSearcherImpl localSearcher = new RemoteIndexSearcherImpl(lLocalIndices.toArray(new String[0]));
+ // localSearcher.setIdAttributename(m_strIdAttributeName);
+ // llSearcherz.add(localSearcher);
+ //
+ //
+ // // die remote reader
+ // for (String strRemoteURL : lRemoteIndices)
+ // {
+ //
+ // try
+ // {
+ // RemoteIndexSearcher searcher = getIndexSearcher(strRemoteURL);
+ //
+ // // check if the remote index is up and running
+ // searcher.maxDoc();
+ //
+ // llSearcherz.add(searcher);
+ // }
+ // catch (Exception e)
+ // {
+ // logger.log(Level.SEVERE, "Exception while creating a MultiSearcher. The index '" + strRemoteURL + "' will be ignored. ('" + e.getMessage() + "')");
+ // logger.log(Level.FINE, "Exception for index '" + strRemoteURL + "': ", e);
+ // }
+ // }
+ //
+ //
+ // // und daraus erzeugen wir uns jetzt nen MultiSearcer
+ // if(llSearcherz.size() == 1) return llSearcherz.get(0);
+ //
+ // RemoteMultiIndexSearcher multiSearcher = new RemoteMultiIndexSearcher(llSearcherz.toArray(new RemoteIndexSearcher[0]));
+ //
+ //
+ // return multiSearcher;
+ //
+ //
+ //
+ // //
+ // //
+ // //
+ // //
+ // // LinkedList llSearchables = new LinkedList();
+ // //
+ // // for (String strIndexPathOrURL : indexPathsOrURLs)
+ // // {
+ // // try
+ // // {
+ // //
+ // // RemoteIndexSearcher searcher = getIndexSearcher(strIndexPathOrURL);
+ // // llSearchables.add(searcher);
+ // //
+ // // }
+ // // catch (Exception e)
+ // // {
+ // // logger.log(Level.SEVERE, "Exception while creating a MultiSearcher. The index '" + strIndexPathOrURL + "' will be ignored. ('" + e.getMessage() + "')");
+ // // logger.log(Level.FINE, "Exception for index '" + strIndexPathOrURL + "': ", e);
+ // // }
+ // // }
+ // //
+ // //
+ // // RemoteMultiIndexSearcher searcher = new RemoteMultiIndexSearcher(llSearchables.toArray(new RemoteIndexSearcher[0]));
+ // //
+ // //
+ // // return searcher;
+ // }
+
+
+ //
+ // synchronized static public RemoteIndexSearcher getMultiIndexSearcher(String strLastIndexInListPathOrID) throws IOException, URISyntaxException
+ // {
+ //
+ // LinkedList llIndices = new LinkedList();
+ //
+ //
+ // // der reader, auf den auch schreibend zugegriffen werden kann, machen wir am Schluß rein - ich habe die Hoffnung,
+ // // daß sich dann nicht die docIDs verschieben, wenn gleichzeitig geschrieben und in diesem und in externen Indices
+ // // gesucht wird...die externen müssen halt readonly sein...und des funzt auch :)
+ //
+ //
+ // HashSet hsIndexPaths = new HashSet();
+ // hsIndexPaths.addAll(getIndexReaderPathsAndIDs());
+ //
+ // // aaalso. wir erstellen alle Readers, und für den letzten wird das Flag eingesetzt...
+ // for (String strIndexPathOrURL : hsIndexPaths)
+ // {
+ //
+ // boolean bIsLast = strIndexPathOrURL.equals(strLastIndexInListPathOrID);
+ //
+ // if(strLastIndexInListPathOrID == null || llIndices.size() == 0 || bIsLast)
+ // llIndices.addLast(strIndexPathOrURL);
+ // else
+ // llIndices.addFirst(strIndexPathOrURL);
+ // }
+ //
+ //
+ // return getMultiIndexSearcher(new LinkedHashSet(llIndices));
+ // }
+
+
+
+
+
+ /**
+ * Gets the time interval after which all reader objects will be refreshed automatically. After a refresh, all objects from subsequent calls of {@link #getLuceneIndexReader(String, boolean)}
+ * will reflect the current state of an index, including any changes done.
+ *
+ * @return the reader refresh time interval
+ */
+ static public long getReaderRefreshIntervall()
+ {
+ return m_lReaderRefreshIntervall;
+ }
+
+
+
+ // /**
+ // * Gets whether native file locking is enabled or not
+ // *
+ // * @return whether native file locking is enabled or not
+ // */
+ // static public boolean isNativeFileLockEnabled()
+ // {
+ // return m_bNativeFileLock;
+ // }
+
+
+
+
+ /**
+ * Returns true in the case a reader object for a given index path is inside the cache
+ *
+ * @param strIndexPathOrURL the index path for the reader object
+ *
+ * @return true in the case a reader object for the given index path is inside the cache
+ */
+ static public boolean isReaderInCache(String strIndexPathOrURL)
+ {
+ return m_hsIndexPathOrId2CurrentIndexReader.containsKey(strIndexPathOrURL);
+ }
+
+
+
+ /**
+ * Refreshes all index readers
+ *
+ * @throws CorruptIndexException
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ synchronized static public void refreshAllIndexReaders() throws CorruptIndexException, IOException, URISyntaxException
+ {
+ LinkedList<String> llKeys = new LinkedList<>();
+ llKeys.addAll(m_hsIndexPathOrId2CurrentIndexReader.keySet());
+
+ for (String strIndexPathOrURL : llKeys)
+ refreshIndexReader(strIndexPathOrURL);
+
+ }
+
+
+
+
+
+
+
+ /**
+ * Refreshes an index reader for a given path. In the case the index reader was not formerly created by {@link #getLuceneIndexReader(String, boolean)}, it will be
+ * created. In the case you pass the ID of a static reader, the method will do nothing.
+ *
+ * @param strIndexPath the path to the lucene index
+ *
+ * @throws CorruptIndexException
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ synchronized static public void refreshIndexReader(String strIndexPath) throws CorruptIndexException, IOException, URISyntaxException
+ {
+ refreshIndexReader(strIndexPath, false);
+ }
+
+
+
+ // static public boolean isLocalPath(String strIndexPathOrURL)
+ // {
+ // try
+ // {
+ //
+ // if(strIndexPathOrURL == null) return false;
+ //
+ // File fIndex = null;
+ // // die super-URI-Implementierung nimmt echt alles an, was auch keine Uri ist, ohne eine syntaxException - insbesondere einen Pfad :(
+ //
+ // if(strIndexPathOrURL.startsWith("file:"))
+ //
+ // fIndex = new File(new URI(strIndexPathOrURL));
+ // else
+ // fIndex = new File(strIndexPathOrURL);
+ //
+ //
+ // if(fIndex.exists()) return true;
+ //
+ // return false;
+ //
+ //
+ // }
+ // catch (URISyntaxException e)
+ // {
+ // return false;
+ // }
+ //
+ // }
+
+
+
+ /**
+ * Refreshes an index reader for a given path. In the case the index reader was not formerly created by {@link #getLuceneIndexReader(String, boolean)}, it will be
+ * created. In the case the index does not exist, it will be created, if you want. In the case you pass the ID of a static reader, the method will do nothing.
+ *
+ * @param strIndexPath the path to the lucene index
+ * @param bCreateIndexIfNotExist if true, the index will be created in the case it does not exist
+ *
+ * @throws CorruptIndexException
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ synchronized static public void refreshIndexReader(String strIndexPath, boolean bCreateIndexIfNotExist) throws CorruptIndexException, IOException, URISyntaxException
+ {
+
+ // do we already have one?
+ IndexReader readerOld = getLuceneIndexReader(strIndexPath, bCreateIndexIfNotExist);
+
+ // if it is a static reader, it will not be refreshed
+ if(m_hsStaticIndexReaderSet.contains(readerOld)) return;
+ // if it is not a DirectoryReader, we cannot refresh it
+ if(!(readerOld instanceof DirectoryReader)) return;
+ DirectoryReader dirReader = (DirectoryReader) readerOld;
+
+ try
+ {
+ if(dirReader.isCurrent()) return;
+
+ logger.info("will refresh reader for index '" + strIndexPath + "'");
+
+ // create the new one
+ // Directory dir = createFSDirectory(new File(strIndexPath));
+ //
+ // if(m_bLoadReadersInMemory) dir = new RAMDirectory(dir);
+ //
+ // IndexReader readerNew = IndexReader.open(dir, true);
+ IndexReader readerNew = DirectoryReader.openIfChanged(dirReader);
+
+
+ // this map always holds the newest reader - the old ones are closed again in the release method
+ m_hsIndexPathOrId2CurrentIndexReader.put(strIndexPath, readerNew);
+
+ }
+ catch (org.apache.lucene.store.AlreadyClosedException e)
+ {
+ logger.warning("reader for '" + strIndexPath + "' was closed at refresh time");
+ }
+ finally
+ {
+ // the old reader will be closed as soon as it is no longer needed
+ releaseLuceneIndexReader(readerOld);
+ }
+
+ }
+
+
+
+
+
+ /**
+ * Releases your IndexWriter that you got with getIndexWriter - do this in any case. When the IndexWriter is no longer needed by any instance, it will be committed and
+ * closed.
+ *
+ * @param indexWriter the writer Object that should be released
+ */
+ synchronized static public void releaseIndexWriter(IndexWriter indexWriter)
+ {
+ try
+ {
+ // we decrement the count for the current index
+ Integer iOld = m_hsIndexWriter2WriterRefCount.get(indexWriter);
+ if(iOld == null || iOld == 0)
+ {
+ logger.warning("have no writer index token for '" + indexWriter + "'");
+ return;
+ }
+
+ // we have to do this here - once the writer is closed, getDirectory throws an exception
+ if(!(indexWriter.getDirectory() instanceof FSDirectory)) throw new IllegalStateException("Directory is not of type FSDirectory");
+
+ String strIndexPathOrURL = ((FSDirectory) indexWriter.getDirectory()).getDirectory().toAbsolutePath().toString();
+
+
+ int iNew = --iOld;
+
+ String strDontCloseIndexWriters = System.getProperty("de.dfki.inquisition.lucene.IndexAccessor.DontCloseIndexWriters");
+ boolean bIgnoreClose = false;
+ if(strDontCloseIndexWriters != null) bIgnoreClose = Boolean.parseBoolean(strDontCloseIndexWriters);
+
+ if(iNew == 0 && !bIgnoreClose)
+ {
+ // if we are down to 0, we close the thing right away
+ Set<Entry<String, IndexWriter>> entrySet = m_hsIndexPathOrURL2Writer.entrySet();
+ Iterator<Entry<String, IndexWriter>> itEntries = entrySet.iterator();
+ while (itEntries.hasNext())
+ {
+ Entry<String, IndexWriter> entry = itEntries.next();
+ if(entry.getValue().equals(indexWriter)) itEntries.remove();
+ }
+
+
+ m_hsIndexWriter2WriterRefCount.remove(indexWriter);
+
+
+ logger.fine("will close indexWriter for '" + strIndexPathOrURL + "'");
+
+ indexWriter.commit();
+ if(isLocalPath(strIndexPathOrURL)) indexWriter.close();
+ }
+ else
+ m_hsIndexWriter2WriterRefCount.put(indexWriter, iNew);
+
+ if(logger.isLoggable(Level.FINEST))
+ {
+ if(bIgnoreClose)
+ logger.finest("indexWriter '" + strIndexPathOrURL + "' released - closing IGNORED (writer is still open)\n" + LoggingUtils.getCurrentStackTrace());
+ else
+ logger.finest("indexWriter '" + strIndexPathOrURL + "' released\n" + LoggingUtils.getCurrentStackTrace());
+ }
+
+ } catch (IOException e)
+ {
+ logger.severe(ExceptionUtils.createStackTraceString(e));
+ }
+ }
+
+
+
+ /**
+ * This is an expert method - the use of RemoteIndexReader is recommended (you don't need to release it). Releases your reader object in the case you don't need it
+ * anymore. When every instance has released the reader for a specific index path, the reader object will be closed.
+ *
+ * @param reader the IndexReader object you formerly got from IndexAccessor
+ */
+ synchronized static public void releaseLuceneIndexReader(IndexReader reader)
+ {
+
+ try
+ {
+
+ if(reader instanceof BetterMultiReader)
+ {
+ for (IndexReader subReader : ((BetterMultiReader) reader).getSubReaders())
+ releaseLuceneIndexReader(subReader);
+
+ return;
+ }
+
+
+ String strIndexPathOrURL4Reader = m_hsIndexReader2IndexPath.get(reader);
+ if(strIndexPathOrURL4Reader == null)
+ logger.severe("have no path entry for reader. This hints at an error, e.g. you have released the reader too often, or the reader was not created with IndexAccessor.");
+
+
+ Integer iOldRefCount = m_hsIndexReader2ReaderRefCount.get(reader);
+
+ if(iOldRefCount == null || iOldRefCount == 0)
+ {
+ logger.warning("have no reader index token for '" + strIndexPathOrURL4Reader + "'");
+ return;
+ }
+
+ int iNew = --iOldRefCount;
+
+ if(iNew == 0)
+ {
+ // if we are down to 0, we close the thing right away - unless it should stay in the cache
+ m_hsIndexReader2ReaderRefCount.remove(reader);
+ m_hsIndexReader2IndexPath.remove(reader);
+
+ // we only close it if it is not the current one from the hashmap - otherwise we would have to recreate it all the time.
+ // the current one is closed when there is a newer one, or explicitly with removeReaderFromCache
+
+ // if it was stated before (with removeReaderFromCacheWhenPossible) that this reader should be closed, we close it as well
+
+ if(!m_hsIndexPathOrId2CurrentIndexReader.containsValue(reader))
+ {
+ // it is not the current reader
+ if(isLocalPath(strIndexPathOrURL4Reader))
+ {
+ logger.info("will close indexReader '" + strIndexPathOrURL4Reader + "'");
+ reader.close();
+ }
+
+ }
+ else if(m_hsReader2Remove.contains(reader)) removeReaderFromCache(strIndexPathOrURL4Reader);
+
+ }
+ else
+ m_hsIndexReader2ReaderRefCount.put(reader, iNew);
+
+
+ if(logger.isLoggable(Level.FINEST)) logger.finest("indexReader '" + strIndexPathOrURL4Reader + "' released\n" + LoggingUtils.getCurrentStackTrace());
+
+
+ }
+ catch (IOException e)
+ {
+ logger.severe(ExceptionUtils.createStackTraceString(e));
+ }
+ }
+
+
+
+ synchronized static public void releaseLuceneIndexSearcher(IndexSearcher searcher)
+ {
+ releaseLuceneIndexReader(searcher.getIndexReader());
+ }
+
+
+
+ /**
+ * Removes and closes the reader object for a given index path from the cache. This is only possible in the case this object is no longer in use - the method will throw
+ * an exception otherwise.
+ *
+ * @param strIndexPathOrURL the path to the index
+ *
+ * @throws IOException
+ */
+ synchronized static public void removeReaderFromCache(String strIndexPathOrURL) throws IOException
+ {
+ // we always keep the current reader for an index in memory - here we can remove it again to free the memory
+
+ // if the old reader is no longer needed, it will be closed
+ IndexReader reader = m_hsIndexPathOrId2CurrentIndexReader.get(strIndexPathOrURL);
+
+ if(m_hsIndexReader2ReaderRefCount.get(reader) == null)
+ {
+ logger.fine("will close indexReader '" + strIndexPathOrURL + "'");
+ m_hsIndexPathOrId2CurrentIndexReader.remove(strIndexPathOrURL);
+ m_hsStaticIndexReaderSet.remove(reader);
+
+ if(isLocalPath(m_hsIndexReader2IndexPath.get(reader))) reader.close();
+
+ m_hsReader2Remove.remove(reader);
+ }
+ else
+ {
+ throw new IllegalStateException("Cannot remove reader object for '" + strIndexPathOrURL
+ + "' from cache. It is still in use. Did you forget an releaseIndexReader(..) invocation?");
+
+ }
+ }
+
+
+
+ /**
+ * Removes and closes the reader object for a given index path from the cache. If the object is still in use, it will be removed from the cache and closed
+ * as soon as it is no longer in use.
+ *
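+ * A minimal usage sketch (the index path is just an example):
+ * <pre>{@code
+ * ReaderStatus status = IndexAccessor.removeReaderFromCacheWhenPossible("/tmp/exampleIndex");
+ *
+ * if(status == ReaderStatus.READER_IN_QUEUE)
+ * {
+ *     // the reader is still in use and will be closed as soon as it is released
+ * }
+ * }</pre>
+ *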
+ * @param strIndexPathOrURL the path to the index
+ *
+ * @return READER_CLOSED in the case the reader was closed immediately, READER_IN_QUEUE if it is in the queue of 'to close readers' now. If the reader is not inside
+ * the cache, the method will return READER_NOT_IN_CACHE
+ *
+ * @throws IOException
+ */
+ synchronized static public ReaderStatus removeReaderFromCacheWhenPossible(String strIndexPathOrURL) throws IOException
+ {
+ // we always keep the current reader for an index in memory - here we can remove it again to free the memory
+
+ if(!isReaderInCache(strIndexPathOrURL)) return ReaderStatus.READER_NOT_IN_CACHE;
+
+ // if the old reader is no longer needed, it will be closed
+ IndexReader reader = m_hsIndexPathOrId2CurrentIndexReader.get(strIndexPathOrURL);
+
+ if(m_hsIndexReader2ReaderRefCount.get(reader) == null)
+ {
+ logger.fine("will close indexReader '" + strIndexPathOrURL + "'");
+ m_hsIndexPathOrId2CurrentIndexReader.remove(strIndexPathOrURL);
+ m_hsStaticIndexReaderSet.remove(reader);
+
+ if(isLocalPath(m_hsIndexReader2IndexPath.get(reader))) reader.close();
+
+ return ReaderStatus.READER_CLOSED;
+
+ }
+ else
+ {
+ m_hsReader2Remove.add(reader);
+
+ return ReaderStatus.READER_IN_QUEUE;
+ }
+ }
+
+
+
+
+ // /**
+ // * Simply removes a formerly cached Searcher Object from the cache. Only remote Searcher proxies are cached - so this is only to give a possibility to free the memory
+ // * again (nevertheless, there should be not much amount of memory consumtion - in the case you have not thousands of searcher objects, you should be able to ignore
+ // * this...(hehe - I didn't say that ;) )
+ // *
+ // * @param strIndexPathOrURL the index for which you want to remove the according searcher proxy object out of the internal cache
+ // */
+ // synchronized static public void removeSearcherFromCache(String strIndexPathOrURL)
+ // {
+ // m_hsIndexPathOrURL2CurrentRemoteSearcher.remove(strIndexPathOrURL);
+ // }
+
+
+
+ /**
+ * Removes and closes all cached reader objects that are not in use. This method can be used safely at any time; the only disadvantage is that a subsequent
+ * invocation of {@link #getLuceneIndexReader(String, boolean)} for one of these indices will take longer.
+ *
+ * @throws IOException
+ */
+ static public void removeUnusedReadersFromCache() throws IOException
+ {
+ LinkedList<String> llIndexPaths = new LinkedList<>();
+
+ llIndexPaths.addAll(m_hsIndexPathOrId2CurrentIndexReader.keySet());
+
+ for (String strIndexPathOrURL : llIndexPaths)
+ try
+ {
+ removeReaderFromCache(strIndexPathOrURL);
+ }
+ catch (IllegalStateException e)
+ {
+ if(!e.getMessage().startsWith("Cannot remove reader object for")) throw e;
+ }
+ }
+
+
+
+ /**
+ * Sets the default analyzer that will be used for writer creation
+ *
+ * @param analyzer the default analyzer that will be used for writer creation
+ */
+ static public void setDefaultAnalyzer(Analyzer analyzer)
+ {
+ m_analyzer4writer = analyzer;
+ }
+
+
+
+ /**
+ * Sets the default attribute name that will be used for RemoteIndexReader creation
+ *
+ * @param strIdAttributeName the default attribute name that will be used for RemoteIndexReader creation
+ */
+ static public void setDefaultIndexIdAttribute(String strIdAttributeName)
+ {
+ IndexAccessor.m_strIdAttributeName = strIdAttributeName;
+ }
+
+
+
+ /**
+ * Sets the time interval after which all reader objects will be refreshed automatically. After a refresh, all objects from subsequent calls of {@link #getLuceneIndexReader(String, boolean)}
+ * will reflect the current state of an index, including any changes done.
+ *
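+ * A minimal usage sketch (the interval value is just an example):
+ * <pre>{@code
+ * // refresh all cached reader objects every 60 seconds
+ * IndexAccessor.setReaderRefreshIntervall(60 * 1000);
+ * }</pre>
+ *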
+ * @param lMillis the time interval, in milliseconds, after which the readers should be refreshed
+ *
+ * @return the former time interval
+ */
+ static public long setReaderRefreshIntervall(long lMillis)
+ {
+ long lOld = m_lReaderRefreshIntervall;
+
+ m_lReaderRefreshIntervall = lMillis;
+
+ return lOld;
+ }
+
+
+
+ protected static FSDirectory createFSDirectory(File fDirPath) throws IOException
+ {
+ // this has to be done in this cumbersome way with setLockFactory - if you simply create it initially and the directory was created with a different
+ // LockFactory, an exception is thrown
+
+
+ // null means SimpleFileLock (I checked ;) )
+ FSDirectory dir = FSDirectory.open(fDirPath.toPath());
+
+ // NativeFSLockFactory lockFactory = new NativeFSLockFactory(fDirPath);
+ // lockFactory.setLockPrefix("indexAccessor");
+ // if(isNativeFileLockEnabled()) dir.setLockFactory(lockFactory);
+
+ return dir;
+ }
+
+
+
+
+ /**
+ * Closes all reader and writer objects. This is mainly for the shutdown hook, to make sure that no other processes will be blocked by non-closed objects
+ *
+ * @throws IOException
+ */
+ protected static void forceCloseAll() throws IOException
+ {
+ if(m_hsIndexReader2ReaderRefCount.size() == 0 && m_hsIndexPathOrURL2Writer.size() == 0) return;
+
+ logger.info("closing of all index readers and writers will be forced " + m_hsIndexReader2ReaderRefCount.size() + " reader(s), "
+ + m_hsIndexPathOrURL2Writer.size() + " writer(s)");
+
+
+ for (IndexReader reader : m_hsIndexReader2ReaderRefCount.keySet())
+ if(isLocalPath(m_hsIndexReader2IndexPath.get(reader))) reader.close();
+
+ for (Entry<String, IndexWriter> pathOrURL2Writer : m_hsIndexPathOrURL2Writer.entrySet())
+ {
+
+ String strPath = pathOrURL2Writer.getKey();
+ IndexWriter writer = pathOrURL2Writer.getValue();
+ writer.commit();
+
+ if(isLocalPath(strPath)) writer.close();
+ }
+ }
+
+
+
+ /**
+ * Gets all reader Objects that should be removed from the cache immediately when they are no more in use
+ *
+ * @return all reader Objects that should be removed from the cache immediately when they are no more in use
+ */
+ protected static HashSet<IndexReader> getReader2RemoveQueue()
+ {
+ return m_hsReader2Remove;
+ }
+
+
+
+
+
+
+ /**
+ * Checks whether the given URL is a local one or not. Local means that the URL starts with 'file:' or that this path exists on the local storage.
+ */
+ protected static boolean isLocalPath(String strIndexPathOrURL)
+ {
+ if(StringUtils.nullOrWhitespace(strIndexPathOrURL)) return false;
+
+ File fIndex = null;
+ // the great URI implementation really accepts everything that is no URI without a syntax exception - in particular a plain path :(
+
+ if(strIndexPathOrURL.startsWith("file:")) return true;
+
+ fIndex = new File(strIndexPathOrURL);
+
+
+ if(fIndex.exists()) return true;
+
+
+ return false;
+ }
+
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/LuceneAnalyzerFactory.java b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneAnalyzerFactory.java
new file mode 100644
index 0000000..ce78cf9
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneAnalyzerFactory.java
@@ -0,0 +1,156 @@
+// * Created on 04.11.2005
+package de.dfki.km.leech.lucene.basic;
+
+
+
+// import de.dfki.inquisitor.lucene.DynamicFieldType;
+// import de.dfki.inquisitor.lucene.FieldConfig;
+import de.dfki.inquisitor.text.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.util.Version;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+
+
+
+public class LuceneAnalyzerFactory
+{
+
+ protected static Logger m_logger = Logger.getLogger(LuceneAnalyzerFactory.class.getName());
+
+
+
+
+ /**
+ * Creates a new Analyzer out of the given field configuration, wrapping the per-field analyzers into a {@link PerFieldAnalyzerWrapper}.
+ *
+ * @param fieldConfig the field configuration that specifies the default analyzer and the per-field analyzers
+ *
+ * @return the according analyzer
+ *
+ * @throws Exception
+ */
+ public static Analyzer createAnalyzer(FieldConfig fieldConfig) throws Exception
+ {
+
+ String strDefaultAnalyzerName = fieldConfig.defaultFieldType.getAnalyzer();
+ Analyzer defaultAnalyzer = LuceneAnalyzerFactory.createAnalyzer(strDefaultAnalyzerName, null);
+
+
+ HashMap<String, Analyzer> hsFieldName2Analyzer = new HashMap<>();
+ for (Entry fieldname2FieldType : fieldConfig.fieldName2FieldType.entrySet())
+ {
+ String strFieldName = fieldname2FieldType.getKey();
+ try
+ {
+ String strAnalyzer4Field = fieldname2FieldType.getValue().getAnalyzer();
+ if(!StringUtils.nullOrWhitespace(strAnalyzer4Field))
+ hsFieldName2Analyzer.put(strFieldName, LuceneAnalyzerFactory.createAnalyzer(strAnalyzer4Field, null));
+ }
+ catch (Exception e)
+ {
+ Logger.getLogger(LuceneAnalyzerFactory.class.getName()).warning("could not create analyzer from config of field '" + strFieldName + "'");
+ }
+ }
+
+ return new PerFieldAnalyzerWrapper(defaultAnalyzer, hsFieldName2Analyzer);
+ }
+
+
+
+ /**
+ * Creates a new Analyzer.
+ *
+ * @param analyzerClassName The class name of the Analyzer to be created.
+ * @param userGivenStopWordFileName The file name of the stop word file, or null or empty, if no stop words should be set. If the given file name is
+ * relative
+ *
+ * @return the newly created analyzer
+ *
+ * @throws Exception
+ */
+ public static Analyzer createAnalyzer(String analyzerClassName, String userGivenStopWordFileName) throws Exception
+ {
+ try
+ {
+ Analyzer analyzer;
+
+ Class<?> analyzerClass = Class.forName(analyzerClassName);
+ if(!StringUtils.nullOrWhitespace(userGivenStopWordFileName))
+ {
+ Class<?>[] parameterClasses = { String[].class };
+ Constructor<?> constructor;
+ try
+ {
+ constructor = analyzerClass.getConstructor(parameterClasses);
+
+
+ m_logger.finer("creating Analyzer " + analyzerClassName + " with stopword file " + userGivenStopWordFileName);
+ InputStreamReader inReader = new InputStreamReader(new FileInputStream(userGivenStopWordFileName), "UTF-8");
+ BufferedReader reader = new BufferedReader(inReader);
+ ArrayList<String> wordList = new ArrayList<String>();
+ String stopWord = reader.readLine();
+ while (stopWord != null)
+ {
+ wordList.add(stopWord);
+ stopWord = reader.readLine();
+ }
+ reader.close();
+ String[] stopWords = wordList.toArray(new String[wordList.size()]);
+
+
+
+ Object[] parameters = { stopWords };
+ analyzer = (Analyzer) constructor.newInstance(parameters);
+ }
+ catch (NoSuchMethodException e)
+ {
+ m_logger.warning("Analyzer '" + analyzerClassName + "' cannot be parameterized with stop word list. Specified stop word list will be ignored");
+ constructor = analyzerClass.getConstructor(new Class[0]);
+ Object[] parameters = {};
+ analyzer = (Analyzer) constructor.newInstance(parameters);
+ }
+
+ }
+ else
+ {
+ m_logger.finer("creating Analyzer " + analyzerClassName + " without stopword file");
+
+
+ try
+ {
+ //we try if there is a constructor with a single Version parameter
+ Class<?>[] parameterClasses = { Version.class };
+ Constructor<?> constructor = analyzerClass.getConstructor(parameterClasses);
+
+ Object[] parameters = { Version.LUCENE_CURRENT };
+ analyzer = (Analyzer) constructor.newInstance(parameters);
+ }
+ catch (NoSuchMethodException e)
+ {
+ analyzer = (Analyzer) analyzerClass.newInstance();
+ }
+
+
+
+ }
+
+ return analyzer;
+
+ }
+ catch (Exception e)
+ {
+ m_logger.log(Level.WARNING, "Unable to instantiate Analyzer '" + analyzerClassName + "'.", e);
+ throw new Exception("Unable to instantiate Analyzer '" + analyzerClassName + "'.", e);
+ }
+ }
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/LuceneUtilz.java b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneUtilz.java
new file mode 100644
index 0000000..612bacc
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneUtilz.java
@@ -0,0 +1,454 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+import de.dfki.inquisitor.collections.TwoValuesBox;
+// import de.dfki.inquisitor.lucene.FieldConfig;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
+import org.apache.lucene.util.Bits;
+
+import java.io.IOException;
+import java.util.*;
+
+
+
+public class LuceneUtilz
+{
+
+
+
+
+ /**
+ * There exists a bug in Lucene (at least currently) that causes some field attributes to be lost when reading a document, which makes re-inserting this
+ * document into the index impossible. As a workaround we re-insert all attributes with stored values into the given document object, with the according field type from
+ * the fieldConfig.
+ *
+ * @param doc the doc object that should be processed
+ */
+ static public void reInsertStoredFieldTypes(Document doc, FieldConfig fieldConfig)
+ {
+ LinkedList<IndexableField> llReInsertFields = new LinkedList<>();
+
+ Iterator<IndexableField> itFields = doc.iterator();
+ while (itFields.hasNext())
+ {
+ IndexableField oldField = itFields.next();
+
+ if(!oldField.fieldType().stored()) continue;
+
+ itFields.remove();
+
+ IndexableField newField;
+ if(oldField.fieldType().docValuesType() == DocValuesType.NUMERIC)
+ newField = fieldConfig.createField(oldField.name(), oldField.numericValue());
+ else
+ newField = fieldConfig.createField(oldField.name(), oldField.stringValue());
+
+ llReInsertFields.add(newField);
+ }
+
+ for (IndexableField newField : llReInsertFields)
+ doc.add(newField);
+
+ }
+
+
+
+ /**
+ * Extract all the terms in the index matching the query terms. Works also with wildcard queries
+ *
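+ * A minimal usage sketch (field name, query string and analyzer are just examples; {@code reader} is assumed to be an already opened IndexReader):
+ * <pre>{@code
+ * QueryParser queryParser = new QueryParser("body", new org.apache.lucene.analysis.core.WhitespaceAnalyzer());
+ *
+ * Set<Term> termsInIndex = LuceneUtilz.extractQueryTerms("bod*", queryParser, reader);
+ * }</pre>
+ *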
+ * @return the terms in the index matching the query terms. Works also with wildcard queries
+ */
+ @SuppressWarnings("javadoc")
+ static public Set<Term> extractQueryTerms(String strQuery, QueryParser queryParser, IndexReader reader)
+ {
+ try
+ {
+ Query query = queryParser.parse(strQuery);
+
+
+ return extractQueryTerms(query, reader);
+
+ }
+ catch (Exception e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+
+
+ /**
+ * Extract all the terms in the index matching the query terms. Works also with wildcard queries
+ *
+ * @return the terms in the index matching the query terms. Works also with wildcard queries
+ */
+ @SuppressWarnings("javadoc")
+ static public Set<Term> extractQueryTerms(Query query, IndexReader reader)
+ {
+ try
+ {
+
+ HashSet<Query> subQueries = LuceneUtilz.getSubQueries(query);
+ List<TwoValuesBox<MultiTermQuery, RewriteMethod>> llQuery2FormerRewrite = new LinkedList<>();
+
+ for (Query subQuery : subQueries)
+ {
+ if(subQuery instanceof MultiTermQuery)
+ {
+ llQuery2FormerRewrite.add(new TwoValuesBox<>((MultiTermQuery) subQuery, ((MultiTermQuery) subQuery).getRewriteMethod()));
+ // we need this so that Lucene expands the terms into BooleanQueries again (prefix queries, etc.)
+ ((MultiTermQuery) subQuery).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
+ }
+ }
+
+ Query rewritten = query.rewrite(reader);
+
+ HashSet<Term> hsTerms = new HashSet<>();
+
+ Weight rewrittenWeight = rewritten.createWeight(new IndexSearcher(reader), false);
+ rewrittenWeight.extractTerms(hsTerms);
+ // rewritten.extractTerms(hsTerms);
+
+ // now we set the rewrite method back to its original value
+ for (TwoValuesBox<MultiTermQuery, RewriteMethod> subQuery2FormerRewrite : llQuery2FormerRewrite)
+ subQuery2FormerRewrite.getFirst().setRewriteMethod(subQuery2FormerRewrite.getSecond());
+
+
+ return hsTerms;
+
+ }
+ catch (Exception e)
+ {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+
+
+ public static List<String> analyzeText(String strFieldName, String strText, Analyzer analyzer, int iMaxResults)
+ {
+ try
+ {
+ LinkedList<String> llTokenStrings = new LinkedList<>();
+
+ // we analyze/normalize the term for the lookup
+ TokenStream tokenstream = analyzer.tokenStream(strFieldName, strText);
+
+ CharTermAttribute termAtt = tokenstream.addAttribute(CharTermAttribute.class);
+ tokenstream.reset(); // Resets this stream to the beginning. (Required)
+
+ for (int i = 0; i < iMaxResults; i++)
+ {
+
+ if(!tokenstream.incrementToken()) break;
+
+ llTokenStrings.add(termAtt.toString());
+ }
+
+ tokenstream.end(); // Perform end-of-stream operations, e.g. set the final offset.
+ tokenstream.close(); // Release resources associated with this stream.
+
+
+ return llTokenStrings;
+
+ }
+ catch (Exception e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+
+
+ static public Bits bits4Doc(final int iDocId, final int iBitsLength)
+ {
+ return new Bits()
+ {
+
+ @Override
+ public boolean get(int index)
+ {
+ if(index == iDocId)
+ return true;
+ else
+ return false;
+ }
+
+
+
+ @Override
+ public int length()
+ {
+ return iBitsLength;
+ }
+ };
+ }
+
+
+
+
+ static public Bits bits4Docs(final Set<Integer> sDocIds, final int iBitsLength)
+ {
+ return new Bits()
+ {
+
+ @Override
+ public boolean get(int index)
+ {
+ if(sDocIds.contains(index))
+ return true;
+ else
+ return false;
+ }
+
+
+
+ @Override
+ public int length()
+ {
+ return iBitsLength;
+ }
+ };
+ }
+
+
+
+
+
+
+ /**
+ * This method creates a query out of the given text for a specific field, with a given analyzer. The method will create a TermQuery in the case the analyzer did not
+ * tokenize the input text into several tokens, or a PhraseQuery in the case it did. All values in the query are fully analyzed and thus searchable for the given field with
+ * respect to the given analyzer.
+ *
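+ * A minimal usage sketch (field name, text and analyzer are just examples); a multi-token text yields a PhraseQuery, a single-token text a TermQuery:
+ * <pre>{@code
+ * Analyzer analyzer = new org.apache.lucene.analysis.core.WhitespaceAnalyzer();
+ *
+ * Query phraseQuery = LuceneUtilz.createQuery("body", "hello world", analyzer); // PhraseQuery
+ * Query termQuery = LuceneUtilz.createQuery("body", "hello", analyzer); // TermQuery
+ * }</pre>
+ *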
+ * @return a TermQuery, PhraseQuery or null in the case there was no text left after processing the text with the analyzer
+ */
+ public static Query createQuery(String strFieldName, String strText, Analyzer analyzer)
+ {
+ List<String> lAnalyzedText = analyzeText(strFieldName, strText, analyzer, Integer.MAX_VALUE);
+
+ if(lAnalyzedText.size() > 1)
+ {
+ PhraseQuery pq = new PhraseQuery(strFieldName, lAnalyzedText.toArray(new String[0]));
+ // for (String strTerm : lAnalyzedText)
+ // pq.add(new Term(strFieldName, strTerm));
+
+ return pq;
+ }
+ else if(lAnalyzedText.size() == 1) return new TermQuery(new Term(strFieldName, lAnalyzedText.get(0)));
+
+ return null;
+ }
+
+
+
+ public static List<Document> getDocsWithTerm(Term term2search, int iMaxResults, IndexSearcher indexSearcher, Set<String> fields2load)
+ {
+
+ try
+ {
+ LinkedList<Document> llDocs = new LinkedList<>();
+
+ TopDocs topDocs = indexSearcher.search(new TermQuery(term2search), iMaxResults);
+
+ for (int i = 0; i < topDocs.scoreDocs.length; i++)
+ {
+
+ int doc = topDocs.scoreDocs[i].doc;
+
+ if(fields2load == null)
+ llDocs.add(indexSearcher.doc(doc));
+ else
+ llDocs.add(indexSearcher.doc(doc, fields2load));
+
+ }
+
+ return llDocs;
+
+ }
+ catch (IOException e)
+ {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+
+
+ /**
+ * Extracts all boolean clauses of a given Query (recursively) into a set
+ *
+ * @param query Query to extract the clauses from
+ * @return a set of the boolean clauses of the given query
+ */
+ public static Set getSubClauses(Query query)
+ {
+ HashSet<BooleanClause> subqueries = new HashSet<BooleanClause>();
+
+ getSubClauses(query, subqueries);
+
+
+ return subqueries;
+ }
+
+
+
+ private static void getSubClauses(Query query, HashSet<BooleanClause> subClauses)
+ {
+ if(!(query instanceof BooleanQuery)) return;
+
+ BooleanClause[] queryClauses = ((BooleanQuery) query).clauses().toArray(new BooleanClause[0]);
+
+ for (BooleanClause clause : queryClauses)
+ {
+ subClauses.add(clause);
+
+ if(clause.getQuery() instanceof BooleanQuery) getSubClauses(clause.getQuery(), subClauses);
+ }
+ }
+
+
+
+ /**
+ * Extracts all subqueries of a given Query. The given query will also be part of the returned set.
+ *
+ * @param query Query to extract subqueries from
+ *
+ * @return all subqueries
+ */
+ public static HashSet getSubQueries(Query query)
+ {
+ HashSet subqueries = new HashSet();
+ getSubQueries(query, subqueries);
+
+ return subqueries;
+ }
+
+
+
+ protected static void getSubQueries(Query query, HashSet subQueries)
+ {
+ if(query instanceof BooleanQuery)
+ {
+ BooleanClause[] queryClauses = ((BooleanQuery) query).clauses().toArray(new BooleanClause[0]);
+
+ for (int i = 0; i < queryClauses.length; i++)
+ getSubQueries(queryClauses[i].getQuery(), subQueries);
+ }
+
+ subQueries.add(query);
+ }
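+
+
+ /**
+ * Illustrative usage sketch, not part of the original patch: builds a small BooleanQuery out of made-up field and term values and collects its parts with
+ * {@link #getSubClauses(Query)} and {@link #getSubQueries(Query)}.
+ */
+ public static void subQueryExtractionExample()
+ {
+ TermQuery inner1 = new TermQuery(new Term("title", "lucene"));
+ TermQuery inner2 = new TermQuery(new Term("body", "index"));
+
+ BooleanQuery.Builder builder = new BooleanQuery.Builder();
+ builder.add(inner1, BooleanClause.Occur.MUST);
+ builder.add(inner2, BooleanClause.Occur.SHOULD);
+ BooleanQuery outer = builder.build();
+
+ // all boolean clauses of outer, including (recursively) the clauses of nested BooleanQueries
+ Set subClauses = getSubClauses(outer);
+
+ // inner1, inner2 and outer itself
+ HashSet subQueries = getSubQueries(outer);
+ }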
+
+
+ //
+ // static public int getTermFrq4Doc(Term term, int iDocId, IndexReader reader)
+ // {
+ // return getTermFrq4Docs(term, bits4Doc(iDocId, reader.maxDoc()), reader);
+ // }
+ //
+ //
+ //
+ // static public int getTermFrq4Docs(Term term, Bits docBits, IndexReader reader)
+ // {
+ //
+ // try
+ // {
+ // DocsEnum docEnum = MultiFields.getTermDocsEnum(reader, docBits, term.field(), term.bytes());
+ // int termFreq = 0;
+ //
+ // @SuppressWarnings("unused")
+ // int doc = DocsEnum.NO_MORE_DOCS;
+ // while ((doc = docEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS)
+ // {
+ // termFreq += docEnum.freq();
+ // }
+ //
+ //
+ // return termFreq;
+ //
+ // }
+ // catch (Exception e)
+ // {
+ // throw new RuntimeException(e);
+ // }
+ // }
+ //
+ //
+ //
+ //
+ //
+ // static public int getTermFrq4Docs(Term term, Set sDocIds, IndexReader reader)
+ // {
+ // return getTermFrq4Docs(term, bits4Docs(sDocIds, reader.maxDoc()), reader);
+ // }
+ //
+ //
+ //
+ //
+ // static public int getTermFrq4Index(Term term, IndexReader reader)
+ // {
+ // return getTermFrq4Docs(term, MultiFields.getLiveDocs(reader), reader);
+ // }
+
+
+
+ /**
+ * Gets the document object together with its Lucene-internal document number for the given unique ID term.
+ */
+ @SuppressWarnings("javadoc")
+ public static TwoValuesBox getUniqueDocWithTerm(Term idTerm2search, IndexSearcher indexSearcher)
+ {
+ return getUniqueDocWithTerm(idTerm2search, indexSearcher, null);
+ }
+
+
+
+ /**
+ * Gets the document object together with its Lucene-internal document number for the given unique ID term, loading only the given fields.
+ */
+ @SuppressWarnings("javadoc")
+ public static TwoValuesBox getUniqueDocWithTerm(Term idTerm2search, IndexSearcher indexSearcher, Set fields2load)
+ {
+
+ try
+ {
+ // XXX here we would want a simple Collector - we don't need any scores!
+ TopDocs topDocs = indexSearcher.search(new TermQuery(idTerm2search), 1);
+
+
+ if(topDocs.totalHits == 0) return null;
+
+ if(topDocs.totalHits > 1) throw new IllegalStateException("multiple document entries for ID term search");
+
+
+ int doc = topDocs.scoreDocs[0].doc;
+
+ Document document;
+ if(fields2load == null)
+ document = indexSearcher.doc(doc);
+ else
+ document = indexSearcher.doc(doc, fields2load);
+
+ if(document == null) return null;
+
+
+ return new TwoValuesBox(document, doc);
+
+ }
+ catch (IOException e)
+ {
+ throw new RuntimeException(e);
+ }
+
+ }
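+
+
+ /**
+ * Illustrative usage sketch, not part of the original patch: resolves a document by a unique ID term. The field and value names are made-up examples.
+ */
+ public static void uniqueDocLookupExample(IndexSearcher indexSearcher)
+ {
+ // the returned TwoValuesBox bundles the Document object and the Lucene-internal document number; it is null if no document carries the ID term
+ TwoValuesBox docAndDocNo = getUniqueDocWithTerm(new Term("id", "doc-4711"), indexSearcher);
+
+ if(docAndDocNo == null)
+ {
+ // nothing found for this ID term
+ }
+ }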
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/PageCountEstimator.java b/src/main/java/de/dfki/km/leech/lucene/basic/PageCountEstimator.java
new file mode 100644
index 0000000..7fce051
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/PageCountEstimator.java
@@ -0,0 +1,107 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.LegacyIntField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
+
+import java.io.IOException;
+
+
+
+public class PageCountEstimator
+{
+
+
+ /**
+ * Adds a page count attribute to a document in the case none exists yet. The method estimates the page count heuristically (i.e. 400 terms => 1 page).
+ *
+ * @param iDocNo the docs index number
+ * @param doc2modify the document
+ * @param strPageCountAttName the name of the field the generated page count will be written to
+ * @param isHeuristicPageCountFlagAttName the name of a flag attribute that will be generated as a hint whether the document page count was estimated heuristically or not
+ * @param strBodyAttName the name of the body attribute the calculation is performed on
+ * @param reader the lucene index reader
+ *
+ * @return true in the case the doc was modified, false otherwise
+ *
+ * @throws Exception
+ */
+ static public boolean addHeuristicDocPageCounts(int iDocNo, Document doc2modify, String strPageCountAttName, String isHeuristicPageCountFlagAttName,
+ String strBodyAttName, IndexReader reader) throws Exception
+ {
+ // if the attribute is not present yet, it is created here - by means of a heuristic
+ // in addition, an extra attribute is written which indicates that the page count was generated with a heuristic
+
+ // if there already is an entry for the page count, the document is ignored (at least this was the case as long as clean numbers were stored in the index)
+ String strPageCountValue = doc2modify.get(strPageCountAttName);
+ // if(strPageCountValue != null)
+ if(strPageCountValue != null && doc2modify.get(isHeuristicPageCountFlagAttName) == null)
+ {
+
+ // if a garbled Leech-style value is stored there, we turn it into a clean number :)
+ int iIndexOfKrutzel = strPageCountValue.indexOf("^^");
+ if(iIndexOfKrutzel == -1) return false;
+
+ String strPageCountValueNice = strPageCountValue.substring(0, iIndexOfKrutzel);
+ doc2modify.removeFields(strPageCountAttName);
+
+ LegacyIntField field = new LegacyIntField(strPageCountAttName, Integer.parseInt(strPageCountValueNice), Store.YES);
+
+ doc2modify.add(field);
+
+ return true;
+ }
+
+ // if there is no entry for the content, the document is ignored as well
+ String strBodyValue = doc2modify.get(strBodyAttName);
+ if(strBodyValue == null) return false;
+
+ // we have an entry for the body but none for the page count - so let's get to work ;)
+
+ int iPageCount = 0;
+
+ // the heuristic: 400 terms make up one page
+
+ int iDocTermCount = getDocumentTermCount(iDocNo, strBodyAttName, reader);
+
+ // we simply assume that roughly 400 words fit on one page...
+ iPageCount = (iDocTermCount / 400) + 1;
+
+ // the estimated page count
+ doc2modify.removeFields(strPageCountAttName);
+ doc2modify.add(new LegacyIntField(strPageCountAttName, iPageCount, Store.YES));
+ // a flag which indicates that this page count was estimated heuristically
+ doc2modify.removeFields(isHeuristicPageCountFlagAttName);
+ doc2modify.add(new StringField(isHeuristicPageCountFlagAttName, "true", Store.YES));
+
+
+ return true;
+ }
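+
+
+ /**
+ * Illustrative usage sketch, not part of the original patch: applies the 400-terms-per-page heuristic to a single document. The field names are made-up
+ * examples; the body field must have been indexed with term vectors, otherwise the term count is 0 and one page is assumed.
+ */
+ static public void pageCountEstimationExample(int iDocNo, Document doc2modify, IndexReader reader) throws Exception
+ {
+ // e.g. a body with 1000 terms yields (1000 / 400) + 1 = 3 estimated pages
+ boolean bModified = addHeuristicDocPageCounts(iDocNo, doc2modify, "pageCount", "pageCountIsHeuristic", "body", reader);
+ }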
+
+
+
+ public static Integer getDocumentTermCount(int iDocNo, String strFieldName4TermCounting, IndexReader reader) throws IOException
+ {
+
+ long lTermCount = 0;
+
+
+ Terms termVector = reader.getTermVector(iDocNo, strFieldName4TermCounting);
+
+ // sometimes there are documents that have no content or no TermFreqVector at all....
+ if(termVector != null) lTermCount = termVector.getSumTotalTermFreq();
+
+
+ return Long.valueOf(lTermCount).intValue();
+ }
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequenciesEntry.java b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequenciesEntry.java
new file mode 100644
index 0000000..80e06b6
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequenciesEntry.java
@@ -0,0 +1,41 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+
+
+public class Term2FrequenciesEntry
+{
+
+ public String term;
+
+ public Integer documentFrequency;
+
+ public Long totalIndexFrequency;
+
+
+
+ public Term2FrequenciesEntry()
+ {
+ }
+
+
+
+ public Term2FrequenciesEntry(String term, Integer documentFrequency, Long totalIndexFrequency)
+ {
+ this.term = term;
+ this.documentFrequency = documentFrequency;
+ this.totalIndexFrequency = totalIndexFrequency;
+
+ }
+
+
+
+
+ @Override
+ public String toString()
+ {
+ return "Term:" + term + " docFRQ:" + documentFrequency + " totalFRQ:" + totalIndexFrequency;
+ }
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequencyEntry.java b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequencyEntry.java
new file mode 100644
index 0000000..a2ffa7a
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequencyEntry.java
@@ -0,0 +1,65 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+
+
+public class Term2FrequencyEntry
+{
+
+ public String term;
+
+ public Integer frequency;
+
+
+
+ public Term2FrequencyEntry()
+ {
+ }
+
+
+
+ public Term2FrequencyEntry(String strTerm, Integer iFrequency)
+ {
+ term = strTerm;
+ frequency = iFrequency;
+
+ }
+
+
+
+ public String getTerm()
+ {
+ return term;
+ }
+
+
+
+ public void setTerm(String term)
+ {
+ this.term = term;
+ }
+
+
+
+ public Integer getFrequency()
+ {
+ return frequency;
+ }
+
+
+
+ public void setFrequency(Integer frequency)
+ {
+ this.frequency = frequency;
+ }
+
+
+
+ @Override
+ public String toString()
+ {
+ return "Term:" + getTerm() + " FRQ:" + getFrequency();
+ }
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/TermPosition.java b/src/main/java/de/dfki/km/leech/lucene/basic/TermPosition.java
new file mode 100644
index 0000000..eb5336e
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/TermPosition.java
@@ -0,0 +1,59 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+
+
+public class TermPosition
+{
+
+ Integer m_iEndOffset;
+
+ Integer m_iPosition;
+
+ Integer m_iStartOffset;
+
+
+
+ public Integer getEndOffset()
+ {
+ return m_iEndOffset;
+ }
+
+
+
+ public Integer getPosition()
+ {
+ return m_iPosition;
+ }
+
+
+
+ public Integer getStartOffset()
+ {
+ return m_iStartOffset;
+ }
+
+
+
+ public void setEndOffset(Integer endOffset)
+ {
+ m_iEndOffset = endOffset;
+ }
+
+
+
+ public void setPosition(Integer position)
+ {
+ m_iPosition = position;
+ }
+
+
+
+ public void setStartOffset(Integer startOffset)
+ {
+ m_iStartOffset = startOffset;
+ }
+
+
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/TextWithTermVectorOffsetsField.java b/src/main/java/de/dfki/km/leech/lucene/basic/TextWithTermVectorOffsetsField.java
new file mode 100644
index 0000000..f4c397c
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/TextWithTermVectorOffsetsField.java
@@ -0,0 +1,48 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+
+
+
+public class TextWithTermVectorOffsetsField extends Field
+{
+
+
+
+ /**
+ * Creates a new {@link TextWithTermVectorOffsetsField}. Default is to generate a stored field.
+ *
+ * @param name field name
+ * @param value String value
+ * @throws IllegalArgumentException if the field name or value is null.
+ */
+ public TextWithTermVectorOffsetsField(String name, String value)
+ {
+
+ super(name, value, new DynamicFieldType(TextField.TYPE_STORED).setStoreTermVectorS(true).setStoreTermVectorOffsetS(true).freezE());
+
+ }
+
+
+
+ /**
+ * Creates a new {@link TextWithTermVectorOffsetsField}
+ *
+ * @param name field name
+ * @param value String value
+ * @param stored Store.YES if the content should also be stored
+ * @throws IllegalArgumentException if the field name or value is null.
+ */
+ public TextWithTermVectorOffsetsField(String name, String value, Store stored)
+ {
+
+
+ super(name, value, new DynamicFieldType(stored == Store.YES ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED).setStoreTermVectorS(true)
+ .setStoreTermVectorOffsetS(true).freezE());
+
+ }
+
+
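+ /**
+ * Illustrative usage sketch, not part of the original patch: adds this field type to a document, once stored and once unstored. Field names and contents are
+ * made-up examples.
+ */
+ public static void usageExample()
+ {
+ org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
+
+ // stored by default, with term vectors and offsets enabled
+ doc.add(new TextWithTermVectorOffsetsField("body", "some full text content"));
+
+ // not stored, but still with term vectors and offsets
+ doc.add(new TextWithTermVectorOffsetsField("bodyNotStored", "some full text content", Store.NO));
+ }
+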
+}
diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/URINotFoundException.java b/src/main/java/de/dfki/km/leech/lucene/basic/URINotFoundException.java
new file mode 100644
index 0000000..a937b3d
--- /dev/null
+++ b/src/main/java/de/dfki/km/leech/lucene/basic/URINotFoundException.java
@@ -0,0 +1,38 @@
+package de.dfki.km.leech.lucene.basic;
+
+
+
+public class URINotFoundException extends Exception
+{
+ private static final long serialVersionUID = 8317129753714055831L;
+
+
+
+ public URINotFoundException()
+ {
+ super();
+ }
+
+
+
+ public URINotFoundException(String message, Throwable cause)
+ {
+ super(message, cause);
+ }
+
+
+
+ public URINotFoundException(String message)
+ {
+ super(message);
+ }
+
+
+
+ public URINotFoundException(Throwable cause)
+ {
+ super(cause);
+ }
+
+
+}
diff --git a/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java
index 907633e..fad0e0e 100644
--- a/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java
@@ -31,7 +31,7 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import de.dfki.inquisition.collections.MultiValueHashMap;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.DataSinkContentHandler;
diff --git a/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java
index 4721d53..d5fc5c9 100644
--- a/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java
@@ -44,7 +44,7 @@
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
-import de.dfki.inquisition.collections.MultiValueHashMap;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.DirectoryCrawlerContext;
diff --git a/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java
index 5729f63..65b4da8 100644
--- a/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java
@@ -44,9 +44,9 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.processes.StopWatch;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+import de.dfki.inquisitor.processes.StopWatch;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.HtmlCrawlerContext;
diff --git a/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java
index f54c5ac..e5fedd4 100644
--- a/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java
@@ -55,8 +55,8 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.ImapCrawlerContext;
diff --git a/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java
index 816b8a8..747ee66 100644
--- a/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java
@@ -9,7 +9,7 @@
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
-import de.dfki.inquisition.collections.MultiValueHashMap;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.km.leech.SubDataEntityContentHandler;
import de.dfki.km.leech.metadata.LeechMetadata;
diff --git a/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java
index 8e60d1c..f5d9d8c 100644
--- a/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java
@@ -18,7 +18,7 @@
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
-import de.dfki.inquisition.collections.MultiValueHashMap;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.io.URLStreamProvider;
diff --git a/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java b/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java
index f3010da..7199b28 100644
--- a/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java
+++ b/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java
@@ -45,7 +45,7 @@
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Bits;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.config.CrawlerContext;
diff --git a/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java b/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java
index 87cbf39..adbf456 100644
--- a/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java
+++ b/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java
@@ -54,9 +54,9 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import de.dfki.inquisition.collections.MultiValueBalancedTreeMap;
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.collections.MultiValueBalancedTreeMap;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.util.TikaUtils;
@@ -658,7 +658,7 @@ protected void parseGeoCoordinates(String strText, Metadata metadata)
- protected void parseInfoBox(String strText, Metadata metadata, ContentHandler handler) throws SAXException
+ protected void parseInfoBox(String strText, Metadata metadata, ContentHandler handler) throws SAXException, IOException
{
// att-value pairs, separated by '|'. Inside a value there can also be line breaks (with '') - these count as an enumeration
@@ -673,7 +673,7 @@ protected void parseInfoBox(String strText, Metadata metadata, ContentHandler ha
// first we cut out the infobox. (?m) is multiline and (?s) is dotall ('.' also matches line breaks)
int iStartInfoBox = -1;
int iEndInfoBox = -1;
- MatchResult infoMatch = StringUtils.findFirst("\\{\\{\\s*Infobox", strText);
+ MatchResult infoMatch = StringUtils.findFirstMatch("\\{\\{\\s*Infobox", strText);
if(infoMatch != null)
{
iStartInfoBox = infoMatch.start();
diff --git a/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java b/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java
index 8d36c17..b64ffd5 100644
--- a/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java
+++ b/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java
@@ -28,10 +28,10 @@
import org.apache.tika.metadata.Metadata;
-import de.dfki.inquisition.collections.CollectionUtilz;
-import de.dfki.inquisition.collections.MultiValueTreeMap;
-import de.dfki.inquisition.processes.StopWatch;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.collections.CollectionUtilz;
+import de.dfki.inquisitor.collections.MultiValueTreeMap;
+import de.dfki.inquisitor.processes.StopWatch;
+import de.dfki.inquisitor.text.StringUtils;
diff --git a/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java b/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java
index 77b8070..c5da910 100644
--- a/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java
+++ b/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java
@@ -14,8 +14,8 @@
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.metadata.Metadata;
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.sax.DataSinkContentHandler;
diff --git a/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java b/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java
index fdd4d0d..3eb31f6 100644
--- a/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java
+++ b/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java
@@ -14,6 +14,7 @@
import java.util.Set;
import java.util.logging.Logger;
+import de.dfki.km.leech.lucene.basic.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
@@ -26,14 +27,14 @@
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.tika.metadata.Metadata;
-import de.dfki.inquisition.file.FileUtils;
-import de.dfki.inquisition.lucene.Buzzwords;
-import de.dfki.inquisition.lucene.DocumentFrqClass;
-import de.dfki.inquisition.lucene.FieldConfig;
-import de.dfki.inquisition.lucene.LuceneUtilz;
-import de.dfki.inquisition.lucene.PageCountEstimator;
-import de.dfki.inquisition.processes.StopWatch;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.file.FileUtilz;
+// import de.dfki.inquisitor.lucene.Buzzwords;
+// import de.dfki.inquisitor.lucene.DocumentFrqClass;
+// import de.dfki.inquisitor.lucene.FieldConfig;
+// import de.dfki.inquisitor.lucene.LuceneUtilz;
+// import de.dfki.inquisitor.lucene.PageCountEstimator;
+import de.dfki.inquisitor.processes.StopWatch;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.metadata.LeechMetadata;
@@ -90,11 +91,6 @@ static protected List terms(String strFieldName, String strPrefix, int i
/**
* Enables the Buzzword creation by setting the related configuration parameters.
- *
- * @param strNewField4Buzzwords
- * @param sAttNames4BuzzwordCalculation
- * @param iMaxNumberOfBuzzwords
- * @param bSkipSimilarTerms
*/
public void enableBuzzwordGeneration(String strNewField4Buzzwords, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms)
{
@@ -284,8 +280,8 @@ public void postprocessIndex(String strLuceneIndexPath, FieldConfig fieldConfig,
}
// fOurTmpDir.renameTo(fLuceneIndex);
- FileUtils.deleteDirectory(new File(pUnpostProcessed.toString()));
- FileUtils.deleteDirectory(fOurTmpDir.toFile());
+ FileUtilz.deleteDirectory(new File(pUnpostProcessed.toString()));
+ FileUtilz.deleteDirectory(fOurTmpDir.toFile());
diff --git a/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java b/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java
index 9af0f33..1025ece 100644
--- a/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java
+++ b/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java
@@ -13,6 +13,7 @@
import java.util.logging.Level;
import java.util.logging.Logger;
+import de.dfki.km.leech.lucene.basic.FieldConfig;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
@@ -23,10 +24,10 @@
import org.apache.tika.parser.ParseContext;
import org.xml.sax.SAXException;
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.lucene.FieldConfig;
-import de.dfki.inquisition.processes.StopWatch;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+// import de.dfki.inquisitor.lucene.FieldConfig;
+import de.dfki.inquisitor.processes.StopWatch;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.lucene.LeechDefaultFieldConfig;
@@ -40,8 +41,7 @@
/**
- * A very simple Lucene Index creator. FieldConfig is from {@link WikipediaDumpParser#getFieldConfig4ParserAttributes()}, currently you can only specify the source
- * dir/file and the target dir for the lucene index
+ * A very simple Lucene Index creator. Currently you can only specify the source dir/file and the target dir for the lucene index
*
* @author Christian Reuschling, Dipl.Ing.(BA)
*
@@ -97,7 +97,7 @@ public static void createIndex(List lUrls2Crawl, String strLuceneIndexPa
Leech leech = new Leech();
- long startTime = StopWatch.startAndLogTime(Level.INFO);
+ long startTime = StopWatch.startAndLogTime(LuceneIndexCreator.class);
CrawlReportContentHandler reportContentHandler;
@@ -145,7 +145,7 @@ public static void createIndex(List lUrls2Crawl, String strLuceneIndexPa
indexWriter.forceMerge(1, true);
indexWriter.close();
- StopWatch.stopAndLogDistance(startTime, Level.INFO);
+ StopWatch.stopAndLogDistance(startTime, LuceneIndexCreator.class);
Logger.getLogger(LuceneIndexCreator.class.getName()).info("..finished crawling " + lUrls2Crawl);
}
diff --git a/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java b/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java
index dc76d4b..8ae1cf2 100644
--- a/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java
+++ b/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java
@@ -13,8 +13,8 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import de.dfki.inquisition.collections.MultiValueHashMap;
-import de.dfki.inquisition.processes.StopWatch;
+import de.dfki.inquisitor.collections.MultiValueHashMap;
+import de.dfki.inquisitor.processes.StopWatch;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.WikipediaDumpParserConfig;
@@ -76,7 +76,7 @@ public void createIndex(List lUrls2Crawl, String strSolrUrl, MultiValueH
Leech leech = new Leech();
- long startTime = StopWatch.startAndLogTime(Level.INFO);
+ long startTime = StopWatch.startAndLogTime(SolrIndexCreator.class);
CrawlReportContentHandler reportContentHandler;
@@ -123,7 +123,7 @@ public void createIndex(List lUrls2Crawl, String strSolrUrl, MultiValueH
leech.parse(lUrls2Crawl.toArray(new String[0]), finalContentHandler, context);
- StopWatch.stopAndLogDistance(startTime, Level.INFO);
+ StopWatch.stopAndLogDistance(startTime, SolrIndexCreator.class);
}
diff --git a/src/main/java/de/dfki/km/leech/util/TikaUtils.java b/src/main/java/de/dfki/km/leech/util/TikaUtils.java
index 0ff3145..9e2f340 100644
--- a/src/main/java/de/dfki/km/leech/util/TikaUtils.java
+++ b/src/main/java/de/dfki/km/leech/util/TikaUtils.java
@@ -29,7 +29,7 @@
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
-import de.dfki.inquisition.text.StringUtils;
+import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.DirectoryCrawlerParser;