diff --git a/.gitignore b/.gitignore index 62eb210..3154aea 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,8 @@ target/ luceneIndex_wikipedia/ testIndex/ bla.txt +.idea/ +leechcrawler.iml +testIndex_4PostProcessing/ -# Except this file -#!.gitignore diff --git a/pom.xml b/pom.xml index f4ea186..4ed9abe 100644 --- a/pom.xml +++ b/pom.xml @@ -1,345 +1,252 @@ - 4.0.0 - de.dfki.km - leechcrawler - jar - 1.11.1 - leechcrawler - http://leechcrawler.github.com/leech/ - - - - Christian Reuschling - reuschling@dfki.uni-kl.de - DFKI, KnowledgeManagement - http://www.dfki.de/web/forschung/km - - - - - 1.7 - 1.7 - - UTF-8 - UTF-8 - UTF-8 - - - - - - - - - - - - - - artifactory-libs-releases-local - http://www.dfki.uni-kl.de/artifactory/libs-releases-local - - - artifactory-libs-snapshots-local - http://www.dfki.uni-kl.de/artifactory/libs-snapshots-local - false - - - - - - - - junit - junit - 4.8.2 - test - - - - org.apache.tika - tika-core - 1.14 - - - - org.apache.tika - tika-parsers - 1.14 - - - - org.apache.lucene - lucene-core - 6.4.2 - - - - org.apache.lucene - lucene-analyzers-common - 6.4.2 - - - - - info.bliki.wiki - bliki-core - 3.0.19 - - - - javax.mail - mail - 1.4.5 - - - - - org.apache.commons - commons-lang3 - 3.3.2 - - - - - org.mapdb - mapdb - - 2.0-beta11 - - - - com.cedarsoftware - json-io - 2.9.3 - - - - de.dfki.km - inquisition - 20151124 - - - - - org.apache.solr - solr-solrj - 5.2.1 - - - - - - - - - - - - - - - org.codehaus.mojo - appassembler-maven-plugin - 1.2.2 - - ${project.build.directory}/assembleDir - -Xmx1G - bin - lib - flat - true - true - - - - .sh - - - - de.dfki.km.leech.util.LuceneIndexCreator - createLuceneIndex - - - de.dfki.km.leech.util.SolrIndexCreator - createSolrIndex - - - de.dfki.km.leech.Leech - leech - - - - - - - - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 2.3.2 - - ${maven.compiler.source} - ${maven.compiler.target} - ${maven.compiler.encoding} - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.5 - - ${maven.compiler.encoding} - true - - - - org.apache.maven.plugins - maven-resources-plugin - 2.5 - - ${maven.compiler.encoding} - - - - org.apache.maven.plugins - maven-source-plugin - 2.1.2 - - - org.apache.maven.plugins - maven-deploy-plugin - 2.5 - - - org.apache.maven.plugins - maven-install-plugin - 2.3.1 - - - org.apache.maven.plugins - maven-release-plugin - 2.2 - - - - maven-assembly-plugin - 2.3 - - - src/main/assembly/distributable.xml - - - - - - - - - - - - - - javadoc - - true - - javadoc - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - javadoc-jar - package - - jar - - - - - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.5 - - - javadoc - - aggregate-jar - - - - - - - - - - - - package-source - - true - - package-source - - - - - - - maven-source-plugin - 2.1.2 - - - package-source-jar - package - - jar - - - - - - - - - - package-project - - - - org.apache.maven.plugins - maven-assembly-plugin - 2.3 - - - - - - - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + 4.0.0 + de.dfki.sds + leechcrawler + jar + 1.25.0 + leechcrawler + http://leechcrawler.github.com/leech/ + + + + Christian Reuschling + reuschling@dfki.uni-kl.de + DFKI, SDS department + https://www.dfki.de/en/web/research/research-departments/smart-data-knowledge-services/ + + + + + 1.7 + 1.7 + + UTF-8 + UTF-8 + UTF-8 + + -Xdoclint:none + + + + + + artifactory-libs-releases-local + http://www.dfki.uni-kl.de/artifactory/libs-releases-local + + + 
artifactory-libs-snapshots-local + http://www.dfki.uni-kl.de/artifactory/libs-snapshots-local + + + + + + + artifactory-libs-releases + http://www.dfki.uni-kl.de/artifactory/libs-releases + + + artifactory-libs-snapshots + http://www.dfki.uni-kl.de/artifactory/libs-snapshots + + + + + + + junit + junit + 4.8.2 + test + + + + + + org.apache.tika + tika-parsers + 1.25 + + + + com.github.jai-imageio + jai-imageio-jpeg2000 + 1.4.0 + + + + + + org.apache.lucene + lucene-core + 6.4.2 + + + + org.apache.lucene + lucene-analyzers-common + 6.4.2 + + + + org.apache.lucene + lucene-queryparser + 6.4.2 + + + + + info.bliki.wiki + bliki-core + + 3.1.0 + + + + com.sun.mail + javax.mail + 1.6.2 + + + + + org.apache.commons + commons-lang3 + 3.11 + + + + + org.mapdb + mapdb + 3.0.8 + + + + com.cedarsoftware + json-io + 2.9.3 + + + + de.dfki.sds + inquisitor + 23_1-SNAPSHOT + + + + + org.apache.solr + solr-solrj + 5.2.1 + + + + + + + + + + + + org.codehaus.mojo + appassembler-maven-plugin + 2.1.0 + + ${project.build.directory}/assembleDir + -Xmx5G --add-opens java.base/java.lang=ALL-UNNAMED + /bin + lib + flat + true + true + + + + .sh + + + + de.dfki.km.leech.util.LuceneIndexCreator + createLuceneIndex + + + de.dfki.km.leech.util.SolrIndexCreator + createSolrIndex + + + de.dfki.km.leech.Leech + leechcrawler + + + + + + + + + org.apache.maven.plugins + maven-source-plugin + 3.2.1 + + + attach-sources + + jar + + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + ${maven.compiler.source} + ${maven.compiler.target} + ${maven.compiler.encoding} + + + -parameters + + + + + + maven-assembly-plugin + 2.3 + + + src/main/assembly/distributable.xml + + + + + + + + + + + + disable-java8-doclint + + [1.8,) + + + -Xdoclint:none + + + + diff --git a/src/main/java/de/dfki/km/leech/Leech.java b/src/main/java/de/dfki/km/leech/Leech.java index fc1a898..0f77826 100644 --- a/src/main/java/de/dfki/km/leech/Leech.java +++ b/src/main/java/de/dfki/km/leech/Leech.java @@ -40,7 +40,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.config.DirectoryCrawlerContext; import de.dfki.km.leech.config.LeechConfig; diff --git a/src/main/java/de/dfki/km/leech/config/CrawlerContext.java b/src/main/java/de/dfki/km/leech/config/CrawlerContext.java index fffcda2..1db187f 100644 --- a/src/main/java/de/dfki/km/leech/config/CrawlerContext.java +++ b/src/main/java/de/dfki/km/leech/config/CrawlerContext.java @@ -89,7 +89,7 @@ public class CrawlerContext protected Map m_userHeaders = null; /** - * Creates a new ParseContext Object with an entry with this {@link #CrawlerContext} configuration. This method is only for convenience. + * Creates a new ParseContext Object with an entry with this {@link CrawlerContext} configuration. This method is only for convenience. * * @return the created ParseContext Object. 
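 * <p>
 * A minimal usage sketch (the method name {@code createParseContext} is assumed here, since the signature itself is not visible in this hunk; the commented call shows one possible way to pass the context on):
 * <pre>
 * CrawlerContext crawlerContext = new CrawlerContext();
 * ParseContext parseContext = crawlerContext.createParseContext();
 * // e.g. hand it over to a crawl (Leech.parse signature assumed):
 * // new Leech().parse("/path/to/crawl", contentHandler, parseContext);
 * </pre>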
*/ diff --git a/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java b/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java index 5ee6b69..92ef3d1 100644 --- a/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java +++ b/src/main/java/de/dfki/km/leech/io/ImapURLStreamProvider.java @@ -18,6 +18,17 @@ +import com.sun.mail.imap.IMAPFolder; +import com.sun.mail.imap.IMAPMessage; +import de.dfki.km.leech.detect.DatasourceMediaTypes; +import de.dfki.km.leech.parser.ImapCrawlerParser; +import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; +import de.dfki.km.leech.util.UrlUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +import javax.mail.*; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -27,24 +38,6 @@ import java.util.logging.Level; import java.util.logging.Logger; -import javax.mail.Folder; -import javax.mail.Message; -import javax.mail.MessagingException; -import javax.mail.Store; -import javax.mail.URLName; - -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; - -import com.sun.mail.imap.IMAPFolder; -import com.sun.mail.imap.IMAPMessage; - -import de.dfki.km.leech.detect.DatasourceMediaTypes; -import de.dfki.km.leech.parser.ImapCrawlerParser; -import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; -import de.dfki.km.leech.util.UrlUtil; - public class ImapURLStreamProvider extends URLStreamProvider diff --git a/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java b/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java index b07e5c5..bd319e0 100644 --- a/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java +++ b/src/main/java/de/dfki/km/leech/lucene/LeechDefaultFieldConfig.java @@ -2,8 +2,13 @@ -import de.dfki.inquisition.lucene.DynamicFieldType; -import de.dfki.inquisition.lucene.FieldConfig; +// import de.dfki.inquisitor.lucene.DynamicFieldType; +// import de.dfki.inquisitor.lucene.FieldConfig; + + + +import de.dfki.km.leech.lucene.basic.DynamicFieldType; +import de.dfki.km.leech.lucene.basic.FieldConfig; diff --git a/src/main/java/de/dfki/km/leech/lucene/LeechSimpleAnalyzer.java b/src/main/java/de/dfki/km/leech/lucene/LeechSimpleAnalyzer.java new file mode 100644 index 0000000..0180ffd --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/LeechSimpleAnalyzer.java @@ -0,0 +1,40 @@ +package de.dfki.km.leech.lucene; + +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.LowerCaseTokenizer; +import org.apache.lucene.util.Version; + + + +/** + * An {@link Analyzer} that filters {@link LetterOrDigitLowerCaseTokenizer} with {@link LowerCaseFilter} + **/ +public class LeechSimpleAnalyzer extends Analyzer +{ + + static final protected LeechSimpleAnalyzer m_singelton = new LeechSimpleAnalyzer(); + + static public LeechSimpleAnalyzer getSingleton() + { + return m_singelton; + } + + + /** + * Creates a new {@link LeechSimpleAnalyzer} + */ + public LeechSimpleAnalyzer() + { + } + + + + @Override + protected TokenStreamComponents createComponents(String fieldName) + { + return new TokenStreamComponents(new LetterOrDigitLowerCaseTokenizer()); + } +} diff --git a/src/main/java/de/dfki/km/leech/lucene/LetterOrDigitLowerCaseTokenizer.java 
b/src/main/java/de/dfki/km/leech/lucene/LetterOrDigitLowerCaseTokenizer.java new file mode 100644 index 0000000..298ab6a --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/LetterOrDigitLowerCaseTokenizer.java @@ -0,0 +1,55 @@ +package de.dfki.km.leech.lucene; + + + +import org.apache.lucene.analysis.util.CharTokenizer; +import org.apache.lucene.util.AttributeFactory; + + + +/** + * Tokenizer that tokenizes between letter and digit entries. The chars will also be converted to lower case. + *
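+ * <p>
+ * For illustration, a sketch using the standard Lucene TokenStream API (the sample input is made up):
+ * <pre>
+ * Tokenizer tokenizer = new LetterOrDigitLowerCaseTokenizer();
+ * tokenizer.setReader(new StringReader("Leech-Crawler, Version 2"));
+ * CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
+ * tokenizer.reset();
+ * while (tokenizer.incrementToken())
+ *     System.out.println(term.toString()); // prints: leech, crawler, version, 2
+ * tokenizer.end();
+ * tokenizer.close();
+ * </pre>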
+ * <br>
+ * Note: this does a decent job for most European languages, but does a terrible job for some Asian languages, where words maybe are not separated by + * spaces, etc. + * + * @author Christian Reuschling, Dipl.Ing.(BA) + */ +public class LetterOrDigitLowerCaseTokenizer extends CharTokenizer +{ + + public LetterOrDigitLowerCaseTokenizer(AttributeFactory factory) + { + super(factory); + } + + + + public LetterOrDigitLowerCaseTokenizer() + { + super(); + } + + + + + /** + * Collects only characters which satisfy {@link Character#isLetterOrDigit(int)}. + */ + @Override + protected boolean isTokenChar(int c) + { + return Character.isLetterOrDigit(c); + } + + + + /** + * Converts char to lower case {@link Character#toLowerCase(int)}. + */ + @Override + protected int normalize(int c) + { + return Character.toLowerCase(c); + } +} diff --git a/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java b/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java index 6b9207c..b4e5ada 100644 --- a/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java +++ b/src/main/java/de/dfki/km/leech/lucene/ToLuceneContentHandler.java @@ -1,16 +1,16 @@ /* * Leech - crawling capabilities for Apache Tika - * + * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling - * + * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License along with this program. If not, see . 
- * + * * Contact us by mail: christian.reuschling@dfki.de */ @@ -18,60 +18,47 @@ +import de.dfki.inquisitor.collections.MultiValueHashMap; +import de.dfki.inquisitor.file.FileUtilz; +// import de.dfki.inquisitor.lucene.FieldConfig; +import de.dfki.km.leech.Leech; +import de.dfki.km.leech.lucene.basic.FieldConfig; +import de.dfki.km.leech.metadata.LeechMetadata; +import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; +import de.dfki.km.leech.sax.DataSinkContentHandler; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.SimpleFSDirectory; +import org.apache.tika.metadata.Metadata; + import java.io.File; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.rmi.server.UID; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.Map.Entry; -import java.util.UUID; import java.util.concurrent.BlockingQueue; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.LinkedBlockingQueue; import java.util.logging.Level; import java.util.logging.Logger; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.Term; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.LockObtainFailedException; -import org.apache.lucene.store.SimpleFSDirectory; -import org.apache.lucene.util.Version; -import org.apache.tika.metadata.Metadata; - -import de.dfki.inquisition.collections.MultiValueHashMap; -import de.dfki.inquisition.file.FileUtils; -import de.dfki.inquisition.lucene.FieldConfig; -import de.dfki.km.leech.Leech; -import de.dfki.km.leech.metadata.LeechMetadata; -import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; -import de.dfki.km.leech.sax.DataSinkContentHandler; - /** * This is a content handler that allows to store crawled data into a Lucene index. You are able to configure the field types and the analyzers that should be used. - * Further, blockindexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is supported, you can enable it with + * Further, blockindexing with {@link IndexWriter#addDocuments(Iterable)} is supported, you can enable it with * {@link ToLuceneContentHandler#setBlockIndexing(boolean)}. If it is enabled, {@link ToLuceneContentHandler} checks whether inside the metadata is a * {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a {@link LeechMetadata#childId} entry will appear as parent documents, docs with * an {@link LeechMetadata#parentId} as childs. {@link ToLuceneContentHandler} collects the child documents if they appear at a processXXX method, and writes them as * block at the time a succeeding parent document appears. In the case a non-parent doc appears, all collected docs will be indexed normally, not as block. 
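 * <p>
 * A minimal wiring sketch (the ToLuceneContentHandler constructor used here, the no-argument LeechDefaultFieldConfig constructor and the Leech.parse/createParseContext signatures are assumptions, not shown in this diff):
 * <pre>
 * IndexWriterConfig config = new IndexWriterConfig(LeechSimpleAnalyzer.getSingleton());
 * config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
 * IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get("./luceneIndex")), config);
 *
 * ToLuceneContentHandler handler = new ToLuceneContentHandler(new LeechDefaultFieldConfig(), writer);
 * handler.setBlockIndexing(true); // write parent/child documents as one Lucene block
 *
 * new Leech().parse("/path/to/crawl", handler, new CrawlerContext().createParseContext());
 * writer.close();
 * </pre>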
- * + * * @author Christian Reuschling, Dipl.Ing.(BA) - * */ public class ToLuceneContentHandler extends DataSinkContentHandler { @@ -90,7 +77,7 @@ public void run() { List llDocs = m_addDocsQueue.take(); - if(llDocs instanceof InterruptThreadList) + if (llDocs instanceof InterruptThreadList) { break; } @@ -99,48 +86,36 @@ public void run() { - if(llDocs.size() == 1) + if (llDocs.size() == 1) { getCurrentWriter().addDocument(llDocs.get(0)); } - else if(llDocs.size() > 1) + else if (llDocs.size() > 1) { getCurrentWriter().addDocuments(llDocs); } - - } - catch (Exception e) + } catch (Exception e) { - Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log( - Level.WARNING, - "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug." - + llDocs); + Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.WARNING, + "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug." + llDocs); } - } - } - catch (InterruptedException e) + } catch (InterruptedException e) { // NOP - } - catch (Exception e) + } catch (Exception e) { Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e); - - } - finally + } finally { try { m_cyclicBarrier4DocConsumerThreads.await(); - } - catch (Exception e2) + } catch (Exception e2) { Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName()).log(Level.SEVERE, "Error", e2); } } - - } } @@ -266,6 +241,21 @@ public ToLuceneContentHandler(Metadata metadata, int writeLimit, FieldConfig fie + protected void addStaticAttValuePairs(Document doc) throws Exception + { + for (Entry fieldName2Value : getStaticAttributeValuePairs().entryList()) + { + IndexableField field = m_fieldConfig.createField(fieldName2Value.getKey(), fieldName2Value.getValue()); + if (field != null) + doc.add(field); + else + Logger.getLogger(ToLuceneContentHandler.class.getName()) + .warning("Could not create lucene field for " + fieldName2Value.getKey() + ":" + fieldName2Value.getValue() + ". Will ignore it."); + } + } + + + /** * Will merge all temporar indices together into the initial indexWriter index. This is only necessary if SplitAndMerge is enabled. Otherwise you don't have to invoke * this method. @@ -283,12 +273,13 @@ public void crawlFinished() m_llConsumerThreads.clear(); - if(getSplitAndMergeIndex() <= 0) return; + if (getSplitAndMergeIndex() <= 0) + return; // hier mergen wir nun alle temporären indices in den originalen // der temporären müssen noch geschlossen werden - das machen wir jetzt. 
Der letzte steht noch nicht in der Liste - if(m_luceneWriter != m_initialLuceneWriter) + if (m_luceneWriter != m_initialLuceneWriter) { for (IndexWriter writer2close : m_llIndexWriter2Close) writer2close.close(); @@ -300,7 +291,8 @@ public void crawlFinished() for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge) llIndicesDirs2Merge.add(new SimpleFSDirectory(Paths.get(strTmpPath))); - if(llIndicesDirs2Merge.size() == 0) return; + if (llIndicesDirs2Merge.size() == 0) + return; Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will merge " + llIndicesDirs2Merge.size() + " temporary indices to the final one."); @@ -310,15 +302,175 @@ public void crawlFinished() m_initialLuceneWriter.commit(); for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge) - FileUtils.deleteDirectory(new File(strTmpPath)); + FileUtilz.deleteDirectory(new File(strTmpPath)); + } catch (Exception e) + { + Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e); + } + } + + + + /** + * Returns null in the case the documents should be ignored according the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)}) + * + * @param metadata + * @param strFulltext + * + * @return null in the case the documents should be ignored according the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)}) + * + * @throws Exception + */ + protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception + { + // // wir erstellen kein Document-Object neu, wenn es nicht unbedingt nötig ist - dazu merken wir uns die Referenzen auf die schon allokierten + // // Document Objekte + // // Document Object reuse + // Document doc = null; + // for (Document preAllocatedDoc : m_llAllocatedDocuments) + // { + // if(!m_llLastChildDocuments.contains(preAllocatedDoc)) + // { + // doc = preAllocatedDoc; + // LinkedList llFieldNames = new + // for (Fieldable field : doc.getFields()) + // doc.removeFields(field.name()); + // + // break; + // } + // } + // if(doc == null) + // { + // doc = new Document(); + // m_llAllocatedDocuments.add(doc); + // } + + Document doc = new Document(); + + // Das man kein Field aus einem reader machen kann ist der Grund, warum processNewMetaData den Fulltext als String und nicht als reader + // übergibt + + // eine eindeutige ID muß da sein + if (metadata.getValues(LeechMetadata.id).length == 0) + doc.add(m_fieldConfig.createField(LeechMetadata.id, new UID().toString())); + if (!getFields2Ignore().contains(LeechMetadata.body)) + doc.add(m_fieldConfig.createField(LeechMetadata.body, strFulltext)); + // die kopien + for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body)) + if (!getFields2Ignore().contains(strFieldCopy)) + doc.add(m_fieldConfig.createField(strFieldCopy, strFulltext)); + + + // die restlichen metadaten + for (String strFieldName : metadata.names()) + { + if (!getFields2Ignore().contains(strFieldName)) + { + for (String strValue : metadata.getValues(strFieldName)) + { + IndexableField field = m_fieldConfig.createField(strFieldName, strValue); + if (field != null) + doc.add(field); + else + Logger.getLogger(ToLuceneContentHandler.class.getName()) + .warning("Could not create lucene field for " + strFieldName + ":" + strValue + ". 
Will ignore it."); + } + } + + // die kopien + for (String strFieldCopy : getFieldCopyMap().get(strFieldName)) + if (!getFields2Ignore().contains(strFieldCopy)) + { + for (String strValue : metadata.getValues(strFieldName)) + { + IndexableField field = m_fieldConfig.createField(strFieldCopy, strValue); + if (field != null) + doc.add(field); + else + Logger.getLogger(ToLuceneContentHandler.class.getName()) + .warning("Could not create lucene field for " + strFieldCopy + ":" + strValue + ". Will ignore it."); + } + } } - catch (Exception e) + + // die statischen Attribut-Value-Paare + addStaticAttValuePairs(doc); + + // und jetzt aggregieren wir noch + for (String strTargetAtt : getFieldAggregationMap().keySet()) { - Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e); + // wenn es das TargetAtt schon im doc gibt, dann aggregieren wir nix + if (doc.get(strTargetAtt) != null) + continue; + + Collection colSourceAtts = getFieldAggregationMap().get(strTargetAtt); + + for (String strSourceAtt : colSourceAtts) + { + String strNewValue = metadata.get(strSourceAtt); + if (strNewValue == null) + strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt); + + if (strNewValue != null) + { + IndexableField field = m_fieldConfig.createField(strTargetAtt, strNewValue); + if (field != null) + doc.add(field); + else + Logger.getLogger(ToLuceneContentHandler.class.getName()) + .warning("Could not create lucene field for " + strTargetAtt + ":" + strNewValue + ". Will ignore it."); + + break; + } + } + } + + + + // wenn ein Doc nicht unseren constraints entspricht, dann ignorieren wir das hier, indem wir null zurück geben + if (m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0) + return doc; + + for (Entry fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet()) + { + IndexableField[] fieldables = doc.getFields(fieldname2fieldValRegEx.getKey()); + for (IndexableField fieldable : fieldables) + { + String strVal = fieldable.stringValue(); + if (strVal.matches(fieldname2fieldValRegEx.getValue())) + { + // wir haben einen Treffer + return doc; + } + } } + + return null; + } + + + + protected void ensureConsumerThreadsRunning() + { + if (m_llConsumerThreads.size() != 0) + return; + + int iCoreCount = Runtime.getRuntime().availableProcessors(); + int iThreadCount = (int) Math.round(iCoreCount / 2d); + iThreadCount = Math.max(iThreadCount, 1); + + m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iThreadCount + 1); + for (int i = 0; i < iThreadCount; i++) + { + Thread consumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + i); + m_llConsumerThreads.add(consumerThread); + consumerThread.setDaemon(true); + + consumerThread.start(); + } } @@ -330,11 +482,58 @@ public boolean getBlockIndexing() + synchronized protected IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException + { + + + if (getSplitAndMergeIndex() <= 0) + return m_initialLuceneWriter; + + if (m_luceneWriter.maxDoc() < getSplitAndMergeIndex()) + return m_luceneWriter; + + + Directory directory = m_initialLuceneWriter.getDirectory(); + + Path fOurTmpDir = null; + if (directory instanceof FSDirectory) + { + if (m_luceneWriter != m_initialLuceneWriter) + m_llIndexWriter2Close.add(m_luceneWriter); + + String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString(); + // if(strTmpPath.charAt(strTmpPath.length() - 1) == '/' || 
strTmpPath.charAt(strTmpPath.length() - 1) == '\\') + // strTmpPath = strTmpPath.substring(0, strTmpPath.length() - 1); + strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1); + fOurTmpDir = Paths.get(strTmpPath); + } + else + { + // wir brauchen was temporäres + File parentDir = new File(System.getProperty("java.io.tmpdir")); + fOurTmpDir = Paths.get(parentDir.getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_")); + } + + Logger.getLogger(ToLuceneContentHandler.class.getName()) + .info("Current index exceeds " + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + fOurTmpDir); + + + @SuppressWarnings("deprecation") IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer()); + config.setOpenMode(OpenMode.CREATE); + + m_luceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config); + m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString()); + + return m_luceneWriter; + } + + + /** * Gets the field aggregation map. This means that you want to generate a field entry, whereby its value should be copied from another, existing metadata entry. You * can specify a list of these source-attributes, the first who have an entry wins and appears as new attribute, so the source field name list is in fact a priorized * list. - * + * * @return the current field aggregation map */ public MultiValueHashMap getFieldAggregationMap() @@ -346,7 +545,7 @@ public MultiValueHashMap getFieldAggregationMap() /** * Gets the field config - * + * * @return the field config */ public FieldConfig getFieldConfig() @@ -360,7 +559,7 @@ public FieldConfig getFieldConfig() * Gets the field copy mappings. This means that the content of every metadata key that is specified as key inside hsSource2TargetFieldnames will be copied into * several other fields. The field names of these fields are specified as corresponding value inside hsSource2TargetFieldnames. In the case you want to rename * attribute names, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)} - * + * * @return the current field mappings */ public MultiValueHashMap getFieldCopyMap() @@ -372,7 +571,7 @@ public MultiValueHashMap getFieldCopyMap() /** * Gets the set of field names / metadata key values that will NOT be stored into the lucene index. - * + * * @return the set of field names / metadata key values that will NOT be stored into the lucene index. */ public HashSet getFields2Ignore() @@ -384,7 +583,7 @@ public HashSet getFields2Ignore() /** * All docs without at least one of the given fieldname-value pairs will be ignored. You can specif regular expressions as field values - * + * * @return the fieldname-value pairs. At least one have to match that a document will be written into the index */ public Map getIgnoreAllDocsWithout() @@ -400,7 +599,7 @@ public Map getIgnoreAllDocsWithout() * writing, until this one also gets 'overfilled'. In the case your crawl is finished, {@link Leech} invokes {@link ToLuceneContentHandler#crawlFinished()}. This will * merge all temporary indices into the initial indexWriter object. This is for performance reasons because writing into a Lucene index tends to get slow after a * certain size. Splitting and merging afterwards is faster. 
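 * <p>
 * Configuration sketch for the setters documented in this class (continuing the handler instance from the sketch above; a MultiValueHashMap.add(key, value) method is assumed, and the field names are made up):
 * <pre>
 * // rename "title" to "heading": copy the value and drop the original field name
 * MultiValueHashMap<String, String> copyMap = new MultiValueHashMap<String, String>();
 * copyMap.add("title", "heading");
 * handler.setFieldCopyMap(copyMap);
 * handler.setFieldNames2Ignore(new HashSet<String>(Arrays.asList("title")));
 *
 * // index only documents whose "source" field matches this regular expression
 * handler.setIgnoreAllDocsWithout(Collections.singletonMap("source", ".*\\.pdf"));
 *
 * // start a new temporary index every 500 000 documents; merged again in crawlFinished()
 * handler.setSplitAndMergeIndex(500000);
 * </pre>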
- * + * * @return the document count a new index will be created */ public int getSplitAndMergeIndex() @@ -412,7 +611,7 @@ public int getSplitAndMergeIndex() /** * Sets some attribute value pairs that will be added to every crawled document. - * + * * @return the current static attribute value pairs */ public MultiValueHashMap getStaticAttributeValuePairs() @@ -422,6 +621,16 @@ public MultiValueHashMap getStaticAttributeValuePairs() + @Override + protected void init() + { + Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will write crawled data into " + m_luceneWriter.getDirectory().toString()); + + ensureConsumerThreadsRunning(); + } + + + @Override public void processErrorData(Metadata metadata) { @@ -439,20 +648,17 @@ public void processModifiedData(Metadata metadata, String strFulltext) // hier modifizieren wir ein schon vorhandenes Dokument Document luceneDocument = createAndFillLuceneDocument(metadata, strFulltext); - if(luceneDocument == null) return; + if (luceneDocument == null) + return; // TODO: was passiert hier mit block-indexierten Dokumenten? - m_initialLuceneWriter - .updateDocument(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)), luceneDocument); - - } - catch (Exception e) + m_initialLuceneWriter.updateDocument(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId)), luceneDocument); + } catch (Exception e) { Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", e); } - } @@ -463,7 +669,8 @@ public void processNewData(Metadata metadata, String strFulltext) try { - if(m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified"); + if (m_initialLuceneWriter == null) + throw new IllegalStateException("Lucene writer was not specified"); m_luceneWriter = getCurrentWriter(); @@ -471,7 +678,8 @@ public void processNewData(Metadata metadata, String strFulltext) Document doc = createAndFillLuceneDocument(metadata, strFulltext); - if(doc == null) return; + if (doc == null) + return; @@ -480,16 +688,16 @@ public void processNewData(Metadata metadata, String strFulltext) // - wenn wir auf ein Doc ohne parent-oder child-Id stossen, dann schreiben wir alle bisherigen Docs als Einzeldokumente raus - nicht im // Block - if(ToLuceneContentHandler.this.getBlockIndexing()) + if (ToLuceneContentHandler.this.getBlockIndexing()) { - if(metadata.get(LeechMetadata.parentId) != null) + if (metadata.get(LeechMetadata.parentId) != null) { // wir haben ein child-Doc (wir haben eine Referenz zu unserem parent). Das merken wir uns einfach m_llLastChildDocuments.add(doc); } - else if(metadata.get(LeechMetadata.childId) != null) + else if (metadata.get(LeechMetadata.childId) != null) { // wir haben ein parentDoc (ein parent hat min eine childId) - wir schreiben zusammen mit den bisher gesammelten im block. 
Das // parentDoc ist das letzte @@ -507,24 +715,15 @@ else if(metadata.get(LeechMetadata.childId) != null) m_addDocsQueue.put(Collections.singletonList(doc)); } - - } else { m_addDocsQueue.put(Collections.singletonList(doc)); } - - - - - - } - catch (Exception e) + } catch (Exception e) { Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e); } - } @@ -534,13 +733,15 @@ public void processNewDocument(Document doc) try { - if(m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified"); + if (m_initialLuceneWriter == null) + throw new IllegalStateException("Lucene writer was not specified"); m_luceneWriter = getCurrentWriter(); ensureConsumerThreadsRunning(); - if(doc == null) return; + if (doc == null) + return; @@ -549,16 +750,16 @@ public void processNewDocument(Document doc) // - wenn wir auf ein Doc ohne parent-oder child-Id stossen, dann schreiben wir alle bisherigen Docs als Einzeldokumente raus - nicht im // Block - if(ToLuceneContentHandler.this.getBlockIndexing()) + if (ToLuceneContentHandler.this.getBlockIndexing()) { - if(doc.get(LeechMetadata.parentId) != null) + if (doc.get(LeechMetadata.parentId) != null) { // wir haben ein child-Doc (wir haben eine Referenz zu unserem parent). Das merken wir uns einfach m_llLastChildDocuments.add(doc); } - else if(doc.get(LeechMetadata.childId) != null) + else if (doc.get(LeechMetadata.childId) != null) { // wir haben ein parentDoc (ein parent hat min eine childId) - wir schreiben zusammen mit den bisher gesammelten im block. Das // parentDoc ist das letzte @@ -576,24 +777,15 @@ else if(doc.get(LeechMetadata.childId) != null) m_addDocsQueue.put(Collections.singletonList(doc)); } - - } else { m_addDocsQueue.put(Collections.singletonList(doc)); } - - - - - - } - catch (Exception e) + } catch (Exception e) { Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e); } - } @@ -616,18 +808,14 @@ public void processRemovedData(Metadata metadata) // TODO: was passiert hier mit block-indexierten Dokumenten? m_initialLuceneWriter.deleteDocuments(new Term(IncrementalCrawlingHistory.dataEntityId, metadata.get(IncrementalCrawlingHistory.dataEntityId))); - - } - catch (Exception e) + } catch (Exception e) { Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", e); } - } - @Override public void processUnmodifiedData(Metadata metadata) { @@ -637,12 +825,12 @@ public void processUnmodifiedData(Metadata metadata) /** - * Sets whether block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is enabled or not. If it is enabled, + * Sets whether block indexing with {@link IndexWriter#addDocuments(Iterable)} is enabled or not. If it is enabled, * {@link ToLuceneContentHandler} checks whether inside the metadata is a {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a * {@link LeechMetadata#childId} entry will appear as parent documents, docs with an {@link LeechMetadata#parentId} as childs. {@link ToLuceneContentHandler} collects * the child documents if they appear at a processXXX method, and writes them as block at the time a succeeding parent document appears. In the case a non-parent doc * appears, all collected docs will be indexed normally, not as block. - * + * * @param blockIndexing true in the case blockindexing should be inabled, false otherwise. 
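 * <p>
 * Illustrative only (the LeechMetadata keys and the processNewDocument/createField/getFieldConfig calls are taken from this class; the concrete id values are made up):
 * <pre>
 * FieldConfig fieldConfig = handler.getFieldConfig();
 *
 * Document child = new Document();
 * child.add(fieldConfig.createField(LeechMetadata.parentId, "id-of-the-parent")); // buffered as child doc
 * handler.processNewDocument(child);
 *
 * Document parent = new Document();
 * parent.add(fieldConfig.createField(LeechMetadata.childId, "id-of-a-child"));    // flushes children + parent as one block
 * handler.processNewDocument(parent);
 * </pre>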
*/ public void setBlockIndexing(boolean blockIndexing) @@ -656,7 +844,7 @@ public void setBlockIndexing(boolean blockIndexing) * Sets the field aggregation map. This means that you want to generate a field entry, whereby its value should be copied from another, existing metadata entry. You * can specify a list of these source-attributes, the first who have an entry wins and appears as new attribute, so the source field name list is in fact a priorized * list. - * + * * @param hsTarget2SourcesFieldnames the field aggregation map */ public void setFieldAggregationMap(MultiValueHashMap hsTarget2SourcesFieldnames) @@ -666,16 +854,13 @@ public void setFieldAggregationMap(MultiValueHashMap hsTarget2So - - - /** * Sets the field copy mappings. This means that the content of every metadata key that is specified as key inside hsSource2TargetFieldnames will be copied into * several other fields. The field names of these fields are specified as corresponding value inside hsSource2TargetFieldnames. In the case you want to rename * attribute names, specify a field mapping and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)} - * + * * @param hsSource2TargetFieldnames keys: source field names, given as metadata keys. values: target field names - the content will also appear under these fields - * inside a lucene document + * inside a lucene document */ public void setFieldCopyMap(MultiValueHashMap hsSource2TargetFieldnames) { @@ -687,7 +872,7 @@ public void setFieldCopyMap(MultiValueHashMap hsSource2TargetFie /** * Sets the set of field names / metadata key values that will NOT be stored into the lucene index. Nevertheless, you can consider these in * {@link #setFieldCopyMap(MultiValueHashMap)}. In this case you have 'moved' the attribute value into another attribute (or several ones). - * + * * @param hsAttNamesNot2Store the set of attribute/field names that will not stored into the lucene index */ public void setFieldNames2Ignore(HashSet hsAttNamesNot2Store) @@ -700,9 +885,9 @@ public void setFieldNames2Ignore(HashSet hsAttNamesNot2Store) /** * All docs without at least one of the given fieldname-value pairs will be ignored. You can specif regular expressions as field values. If this is set to null or to * an empty map, all documents will be accepted. - * + * * @param hsFieldName2FieldValue the fieldname-value pairs. At least one have to match that a document will be written into the index - * + * * @return this */ public ToLuceneContentHandler setIgnoreAllDocsWithout(Map hsFieldName2FieldValue) @@ -714,7 +899,6 @@ public ToLuceneContentHandler setIgnoreAllDocsWithout(Map hsFiel - /** * If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)} invocation whether the current * indexWriter has more than iSplitIndexDocumentCount documents. In the case it has more, {@link ToLuceneContentHandler} will create an entirely new index for @@ -722,10 +906,10 @@ public ToLuceneContentHandler setIgnoreAllDocsWithout(Map hsFiel * indices into the initial indexWriter object. This invocation will be done automatically by the {@link Leech} class. This is for performance reasons because writing * into a Lucene index tends to get slow after a certain size. Splitting and merging afterwards is faster. Update: this behaviour depends on the Lucene version used, * currently this seems to be not a problem. Thus, this functionality is disabled per default. 
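 * <p>
 * Sketch for the aggregation and static-value setters described above (attribute names are made up; a MultiValueHashMap.add(key, value) method is assumed):
 * <pre>
 * // "date" is aggregated from "created" or, as fallback, "lastModified" (prioritized source list)
 * MultiValueHashMap<String, String> aggregationMap = new MultiValueHashMap<String, String>();
 * aggregationMap.add("date", "created");
 * aggregationMap.add("date", "lastModified");
 * handler.setFieldAggregationMap(aggregationMap);
 *
 * // added to every crawled document
 * MultiValueHashMap<String, String> staticPairs = new MultiValueHashMap<String, String>();
 * staticPairs.add("crawlSource", "fileShare42");
 * handler.setStaticAttributeValuePairs(staticPairs);
 * </pre>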
- * + * * @param iSplitIndexDocumentCount the document count a new index will be created. A good size is 500 000 (from my stomach feeling, if it is necessary). -1 in the - * case you want to disable SplitAndMerge, which is the default. - * + * case you want to disable SplitAndMerge, which is the default. + * * @return this */ public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount) @@ -739,9 +923,9 @@ public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount /** * Sets some attribute value pairs that will be added to every crawled document. - * + * * @param hsStaticAttValuePairs a multi value map containing the additional attribute value pairs - * + * * @return this */ public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap hsStaticAttValuePairs) @@ -750,244 +934,4 @@ public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap fieldName2Value : getStaticAttributeValuePairs().entryList()) - { - IndexableField field = m_fieldConfig.createField(fieldName2Value.getKey(), fieldName2Value.getValue()); - if(field != null) - doc.add(field); - else - Logger.getLogger(ToLuceneContentHandler.class.getName()).warning( - "Could not create lucene field for " + fieldName2Value.getKey() + ":" + fieldName2Value.getValue() + ". Will ignore it."); - } - } - - - - - /** - * Returns null in the case the documents should be ignored according the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)}) - * - * @param metadata - * @param strFulltext - * - * @return null in the case the documents should be ignored according the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)}) - * - * @throws Exception - */ - protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception - { - // // wir erstellen kein Document-Object neu, wenn es nicht unbedingt nötig ist - dazu merken wir uns die Referenzen auf die schon allokierten - // // Document Objekte - // // Document Object reuse - // Document doc = null; - // for (Document preAllocatedDoc : m_llAllocatedDocuments) - // { - // if(!m_llLastChildDocuments.contains(preAllocatedDoc)) - // { - // doc = preAllocatedDoc; - // LinkedList llFieldNames = new - // for (Fieldable field : doc.getFields()) - // doc.removeFields(field.name()); - // - // break; - // } - // } - // if(doc == null) - // { - // doc = new Document(); - // m_llAllocatedDocuments.add(doc); - // } - - Document doc = new Document(); - - - - // Das man kein Field aus einem reader machen kann ist der Grund, warum processNewMetaData den Fulltext als String und nicht als reader - // übergibt - - // eine eindeutige ID muß da sein - if(metadata.getValues(LeechMetadata.id).length == 0) doc.add(m_fieldConfig.createField(LeechMetadata.id, new UID().toString())); - if(!getFields2Ignore().contains(LeechMetadata.body)) doc.add(m_fieldConfig.createField(LeechMetadata.body, strFulltext)); - // die kopien - for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body)) - if(!getFields2Ignore().contains(strFieldCopy)) doc.add(m_fieldConfig.createField(strFieldCopy, strFulltext)); - - - // die restlichen metadaten - for (String strFieldName : metadata.names()) - { - if(!getFields2Ignore().contains(strFieldName)) - { - for (String strValue : metadata.getValues(strFieldName)) - { - IndexableField field = m_fieldConfig.createField(strFieldName, strValue); - if(field != null) - doc.add(field); - else - Logger.getLogger(ToLuceneContentHandler.class.getName()).warning( - 
"Could not create lucene field for " + strFieldName + ":" + strValue + ". Will ignore it."); - } - - } - - // die kopien - for (String strFieldCopy : getFieldCopyMap().get(strFieldName)) - if(!getFields2Ignore().contains(strFieldCopy)) - { - for (String strValue : metadata.getValues(strFieldName)) - { - IndexableField field = m_fieldConfig.createField(strFieldCopy, strValue); - if(field != null) - doc.add(field); - else - Logger.getLogger(ToLuceneContentHandler.class.getName()).warning( - "Could not create lucene field for " + strFieldCopy + ":" + strValue + ". Will ignore it."); - } - } - } - - // die statischen Attribut-Value-Paare - addStaticAttValuePairs(doc); - - // und jetzt aggregieren wir noch - for (String strTargetAtt : getFieldAggregationMap().keySet()) - { - // wenn es das TargetAtt schon im doc gibt, dann aggregieren wir nix - if(doc.get(strTargetAtt) != null) continue; - - Collection colSourceAtts = getFieldAggregationMap().get(strTargetAtt); - - for (String strSourceAtt : colSourceAtts) - { - String strNewValue = metadata.get(strSourceAtt); - if(strNewValue == null) strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt); - - if(strNewValue != null) - { - IndexableField field = m_fieldConfig.createField(strTargetAtt, strNewValue); - if(field != null) - doc.add(field); - else - Logger.getLogger(ToLuceneContentHandler.class.getName()).warning( - "Could not create lucene field for " + strTargetAtt + ":" + strNewValue + ". Will ignore it."); - - break; - } - } - } - - - - // wenn ein Doc nicht unseren constraints entspricht, dann ignorieren wir das hier, indem wir null zurück geben - if(m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0) return doc; - - for (Entry fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet()) - { - IndexableField[] fieldables = doc.getFields(fieldname2fieldValRegEx.getKey()); - for (IndexableField fieldable : fieldables) - { - String strVal = fieldable.stringValue(); - if(strVal.matches(fieldname2fieldValRegEx.getValue())) - { - // wir haben einen Treffer - return doc; - } - } - } - - - return null; - } - - - - - - - - protected void ensureConsumerThreadsRunning() - { - if(m_llConsumerThreads.size() != 0) return; - - int iCoreCount = Runtime.getRuntime().availableProcessors(); - int iThreadCount = (int) Math.round(iCoreCount / 2d); - iThreadCount = Math.max(iThreadCount, 1); - - m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iThreadCount + 1); - for (int i = 0; i < iThreadCount; i++) - { - Thread consumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + i); - m_llConsumerThreads.add(consumerThread); - consumerThread.setDaemon(true); - - consumerThread.start(); - } - } - - - - synchronized protected IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException - { - - - if(getSplitAndMergeIndex() <= 0) return m_initialLuceneWriter; - - if(m_luceneWriter.maxDoc() < getSplitAndMergeIndex()) return m_luceneWriter; - - - Directory directory = m_initialLuceneWriter.getDirectory(); - - Path fOurTmpDir = null; - if(directory instanceof FSDirectory) - { - if(m_luceneWriter != m_initialLuceneWriter) m_llIndexWriter2Close.add(m_luceneWriter); - - String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString(); - // if(strTmpPath.charAt(strTmpPath.length() - 1) == '/' || strTmpPath.charAt(strTmpPath.length() - 1) == '\\') - // strTmpPath = strTmpPath.substring(0, strTmpPath.length() - 1); - 
strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1); - fOurTmpDir = Paths.get(strTmpPath); - } - else - { - // wir brauchen was temporäres - File parentDir = new File(System.getProperty("java.io.tmpdir")); - fOurTmpDir = Paths.get(parentDir.getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_")); - } - - Logger.getLogger(ToLuceneContentHandler.class.getName()).info( - "Current index exceeds " + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + fOurTmpDir); - - - @SuppressWarnings("deprecation") - IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer()); - config.setOpenMode(OpenMode.CREATE); - - m_luceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config); - m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString()); - - return m_luceneWriter; - } - - - - @Override - protected void init() - { - Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will write crawled data into " + m_luceneWriter.getDirectory().toString()); - - ensureConsumerThreadsRunning(); - } - - - - - - - } diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/Buzzwords.java b/src/main/java/de/dfki/km/leech/lucene/basic/Buzzwords.java new file mode 100644 index 0000000..8ed2e52 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/Buzzwords.java @@ -0,0 +1,954 @@ +package de.dfki.km.leech.lucene.basic; + + + +import de.dfki.inquisitor.collections.MultiValueTreeMap; +// import de.dfki.inquisitor.lucene.DynamicFieldType; +// import de.dfki.inquisitor.lucene.*; +import de.dfki.inquisitor.text.Levenshtein; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.similarities.ClassicSimilarity; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.*; +import java.util.Map.Entry; + + + +/** + * The class Buzzwords extracts keywords out of documents - these can be in the form of lucene-documents, which enables to calculate the buzzwords very fast because the + * most information is still in the lucene index. But also strings can be processed, with an index as a base for calculation + * + * @author Christian Reuschling, Elisabeth Wolf + * + */ +public class Buzzwords +{ + + + static protected ClassicSimilarity m_defaultSimilarity = new ClassicSimilarity(); + + + + // + // /** + // * Adds calculated buzzwords to the given document. The method makes use of the IndexAccessor default Analyzer. + // * + // * @param doc2modify the document that should enriched with a new buzzword field + // * @param strIdFieldName the attribute name that should be used to identify the documents according to their id String + // * @param strNewField4Buzzwords the attribute that should be created for the buzzword. 
Becomes part of the document object + // * @param sAttNames4BuzzwordCalculation the attributes that should be considered for buzzword generation + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords the method should generate + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. + // * + // * @return true in the case the document object was modified, false otherwise. The method do not modify the index entry + // * + // * @throws Exception + // */ + // static public boolean addBuzzwords(Document doc2modify, String strIdFieldName, String strNewField4Buzzwords, Set sAttNames4BuzzwordCalculation, + // int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws Exception + // { + // + // + // String strDocID = getAttributeValue(doc2modify, strIdFieldName); + // List lBuzzwords = getBuzzwords(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, hsIndexPaths); + // + // // wenn es keinen Content gibt, mache mer gar nix + // if(lBuzzwords == null) return false; + // + // StringBuilder strbBuzzWordz = new StringBuilder(); + // + // for (int i = 0; i < Math.min(iMaxNumberOfBuzzwords, lBuzzwords.size()); i++) + // strbBuzzWordz.append(lBuzzwords.get(i)).append(" "); + // + // + // // wenn es das Buzzword-feld schon gibt, wirds gelöscht + // doc2modify.removeFields(strNewField4Buzzwords); + // // die neu berechneten Buzzwords werden zum Doc hinzugefügt + // doc2modify.add(new TextWithTermVectorOffsetsField(strNewField4Buzzwords, strbBuzzWordz.toString())); + // + // + // return true; + // } + + + + /** + * Gets the value of an attribute inside the document as String. + * + * @param doc + * @param strFieldName the attributes name + * + * @return the first attribute value under the given attribute name + */ + private static String getAttributeValue(Document doc, String strFieldName) + { + + IndexableField docAtt = doc.getField(strFieldName); + if(docAtt == null) return null; + + + return docAtt.stringValue(); + } + + + + // + // /** + // * Gets the buzzwords for fields of a document. The metohd makes use of the IndexAccessor default Analyzer. + // * + // * @param strDocID the ID of the document from which the buzzwords should be extracted + // * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. 
+ // * + // * @return the list of the extracted buzzwords, null in the case the given attribute doesn't exist + // * + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URINotFoundException + // * @throws URISyntaxException + // */ + // static public List getBuzzwords(String strDocID, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, + // LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException + // { + // + // LinkedHashMap buzzwordsWithTfIdf = + // getBuzzwordsWithTfIdf(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, hsIndexPaths); + // + // LinkedList llBuzzwords = new LinkedList(buzzwordsWithTfIdf.keySet()); + // + // + // return llBuzzwords; + // } + + + // + // + // /** + // * Gets the buzzwords for fields of a document. The metohd makes use of the IndexAccessor default Analyzer. + // * + // * @param strDocID the ID of the document from which the buzzwords should be extracted + // * @param strFieldName the name of the attribute the buzzwords should be extracted from + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. + // * + // * @return the list of the extracted buzzwords, null in the case the given attribute doesn't exist + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URINotFoundException + // * @throws URISyntaxException + // */ + // static public List> getBuzzwords4AllFieldValues(String strDocID, String strFieldName, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, + // LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException + // { + // + // List> buzzwordsWithTfIdfMaps = + // getBuzzwordsWithTfIdf4AllFieldValues(strDocID, strFieldName, iMaxNumberOfBuzzwords, bSkipSimilarTerms, hsIndexPaths); + // + // LinkedList> llbuzzwords4AllFieldValues = new LinkedList>(); + // for (LinkedHashMap hsBuzzwords2TfIdf : buzzwordsWithTfIdfMaps) + // { + // + // LinkedList llBuzzwords = new LinkedList(hsBuzzwords2TfIdf.keySet()); + // + // llbuzzwords4AllFieldValues.add(llBuzzwords); + // } + // + // + // return llbuzzwords4AllFieldValues; + // } + + + // + // + // /** + // * Gets the buzzwords for fields of a document, together with their document TfIdf value. The metohd makes use of the IndexAccessor default Analyzer. + // * + // * @param strDocID the ID of the document from which the buzzwords should be extracted + // * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from. + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. + // * + // * @return the extracted buzzwords, boosted according their score. Key: the term itself. Value: the according score. 
null in the case the given attribute doesn't + // * exist. + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URINotFoundException + // * @throws URISyntaxException + // */ + // static public LinkedHashMap getBuzzwordsWithTfIdf(String strDocID, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords, + // boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException + // { + // + // MultiValueTreeMap tmScore2Term = + // retrieveInterestingTerms(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, hsIndexPaths); + // + // if(tmScore2Term.valueSize() < iMaxNumberOfBuzzwords) + // { + // + // MultiValueTreeMap tmScore2TermWeak = + // retrieveInterestingTerms(strDocID, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 1, 1, 2, bSkipSimilarTerms, hsIndexPaths); + // + // while (tmScore2TermWeak.keySize() > 0) + // { + // Float fTfIdf = tmScore2TermWeak.firstKey(); + // String strTopTerm = tmScore2TermWeak.getFirst(fTfIdf); + // tmScore2TermWeak.remove(fTfIdf, strTopTerm); + // + // if(!tmScore2Term.containsValue(strTopTerm)) tmScore2Term.add(fTfIdf, strTopTerm); + // + // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) break; + // } + // } + // + // LinkedHashMap hsTerm2TfIdf = new LinkedHashMap(); + // for (Entry score2term : tmScore2Term.entryList()) + // hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey()); + // + // + // return hsTerm2TfIdf; + // } + + + + // + // /** + // * This method is for calculating buzzwords out of an arbritrary String, by giving an index attribute as 'context. The string will be tokenized according the given + // * analyzer for this attribute (as set by the IndexAccessor default analyzer), and also takes the document frequencies for all terms of this attribute. + // * + // * @param strDocumentText the text of the document. This text influences the buzzword calculation as it would be an attribute value of + // * strAttributeName4BuzzwordCalculation + // * @param strAttributeName4BuzzwordCalculation this is the name of the attribute the given text should be differentiated against with buzzwords + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. + // * + // * @return the extracted buzzwords, with their according tfidf value, sorted by TfIdf values. Key: the term itself. Value: the tfIdf value. 
+ // * + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URINotFoundException + // * @throws URISyntaxException + // */ + // static public LinkedHashMap getBuzzwordsWithTfIdf(String strDocumentText, String strAttributeName4BuzzwordCalculation, int iMaxNumberOfBuzzwords, + // boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException + // { + // MultiValueTreeMap tmScore2Term = + // retrieveInterestingTerms(strDocumentText, strAttributeName4BuzzwordCalculation, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, hsIndexPaths); + // + // if(tmScore2Term.valueSize() < iMaxNumberOfBuzzwords) + // { + // + // MultiValueTreeMap tmScore2TermWeak = + // retrieveInterestingTerms(strDocumentText, strAttributeName4BuzzwordCalculation, iMaxNumberOfBuzzwords, 1, 1, 2, bSkipSimilarTerms, hsIndexPaths); + // + // while (tmScore2TermWeak.keySize() > 0) + // { + // Float fTfIdf = tmScore2TermWeak.firstKey(); + // String strTopTerm = tmScore2TermWeak.getFirst(fTfIdf); + // tmScore2TermWeak.remove(fTfIdf, strTopTerm); + // + // if(!tmScore2Term.containsValue(strTopTerm)) tmScore2Term.add(fTfIdf, strTopTerm); + // + // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) break; + // } + // } + // + // LinkedHashMap hsTerm2TfIdf = new LinkedHashMap(); + // for (Entry score2term : tmScore2Term.entryList()) + // hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey()); + // + // + // return hsTerm2TfIdf; + // + // } + + + + // /** + // * Gets the buzzwords for fields of a document, together with their document TfIdf value. The metohd makes use of the IndexAccessor default Analyzer. + // * + // * @param strDocID the ID of the document from which the buzzwords should be extracted + // * @param strFieldName the name of the attribute the buzzwords should be extracted from. + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. + // * + // * @return the extracted buzzwords, boosted according their score. Key: the term itself. Value: the according score. null in the case the given attribute doesn't + // * exist. 
+ // * @throws CorruptIndexException + // * @throws IOException + // * @throws URINotFoundException + // * @throws URISyntaxException + // */ + // static public List> getBuzzwordsWithTfIdf4AllFieldValues(String strDocID, String strFieldName, int iMaxNumberOfBuzzwords, + // boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, URINotFoundException, URISyntaxException + // { + // + // List> tmScore2TermMaps = + // retrieveInterestingTerms4AllFieldValues(strDocID, strFieldName, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, hsIndexPaths); + // + // // aus Performancegründen verzichte ich hier mal auf eine 'weichere' Strategie, falls unsere Maximalanzahl der Buzzwords nicht erreicht wurde + // + // LinkedList> hsTerm2ScoreMaps = new LinkedList>(); + // + // for (MultiValueTreeMap hsScore2Term : tmScore2TermMaps) + // { + // LinkedHashMap hsTerm2TfIdf = new LinkedHashMap(); + // for (Entry score2term : hsScore2Term.entryList()) + // hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey()); + // + // hsTerm2ScoreMaps.add(hsTerm2TfIdf); + // } + // + // + // return hsTerm2ScoreMaps; + // } + + + + + /** + * Adds calculated buzzwords to the given document. The method makes use of the IndexAccessor default Analyzer. + * + * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify + * @param doc2modify the document that should enriched with a new buzzword field + * @param strNewField4Buzzwords the attribute that should be created for the buzzword. Becomes part of the document object + * @param sAttNames4BuzzwordCalculation the attributes that should be considered for buzzword generation + * @param iMaxNumberOfBuzzwords the maximum number of buzzwords the method should generate + * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + * @param reader the lucene index reader + * + * @return true in the case the document object was modified, false otherwise. 
The method does not modify the index entry.
+ *
+ * @throws Exception
+ */
+ static public boolean addBuzzwords(int iDocNo, Document doc2modify, String strNewField4Buzzwords, Set sAttNames4BuzzwordCalculation,
+ int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, IndexReader reader) throws Exception
+ {
+
+
+ List lBuzzwords = getBuzzwords(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, reader);
+
+ // if there is no content at all, we do nothing
+ if(lBuzzwords == null) return false;
+
+ StringBuilder strbBuzzWordz = new StringBuilder();
+
+ for (int i = 0; i < Math.min(iMaxNumberOfBuzzwords, lBuzzwords.size()); i++)
+ strbBuzzWordz.append(lBuzzwords.get(i)).append(" ");
+
+
+ // if the buzzword field already exists, it gets removed
+ doc2modify.removeFields(strNewField4Buzzwords);
+ // the newly calculated buzzwords are added to the doc
+ FieldType fieldType =
+ new DynamicFieldType().setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS).setStoreD(true).setStoreTermVectorS(true)
+ .setStoreTermVectorOffsetS(true).setTokenizeD(true).freezE();
+
+ Field field4buzzwords = new Field(strNewField4Buzzwords, strbBuzzWordz.toString(), fieldType);
+ doc2modify.add(field4buzzwords);
+
+
+ return true;
+ }
+
+
+
+ static protected int docID2DocNo(String strDocIdAttributeName, String strDocID, IndexReader reader) throws Exception
+ {
+ int luceneDocumentNumber;
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+
+ TopDocs topDocs = searcher.search(new TermQuery(new Term(strDocIdAttributeName, strDocID)), 1);
+
+ if(topDocs.totalHits == 0) throw new Exception("no lucene document found with id '" + strDocID + "'");
+
+ // there should be exactly one document with this id...
+ luceneDocumentNumber = topDocs.scoreDocs[0].doc;
+
+ return luceneDocumentNumber;
+ }
+
+
+
+
+
+
+
+ /**
+ * Gets the buzzwords for fields of a document. The method makes use of the IndexAccessor default Analyzer.
+ *
+ * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify
+ * @param doc2modify the document that should be enriched with a new buzzword field
+ * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from
+ * @param iMaxNumberOfBuzzwords the maximum number of buzzwords
+ * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability
+ * @param reader the lucene index reader
+ *
+ * @return the list of the extracted buzzwords, null in the case the given attribute doesn't exist
+ *
+ * @throws Exception
+ * @throws URINotFoundException
+ */
+ static public List getBuzzwords(int iDocNo, Document doc2modify, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords,
+ boolean bSkipSimilarTerms, IndexReader reader) throws Exception
+ {
+
+ LinkedHashMap buzzwordsWithTfIdf =
+ getBuzzwordsWithTfIdf(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, bSkipSimilarTerms, reader);
+
+ LinkedList llBuzzwords = new LinkedList(buzzwordsWithTfIdf.keySet());
+
+
+ return llBuzzwords;
+ }
+
+
+
+ /**
+ * Gets the buzzwords for fields of a document, together with their document TfIdf value. The method makes use of the IndexAccessor default Analyzer.
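+ * <p>
+ * A minimal usage sketch (illustrative only; the index directory, the id attribute name and the field names are assumptions, and the fields must have been indexed with term vectors):
+ * <pre>
+ * IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("./luceneIndex")));
+ * int iDocNo = docID2DocNo("id", "urn:example:doc42", reader);
+ * Document doc2modify = reader.document(iDocNo);
+ * Set sFields = new HashSet(Arrays.asList("title", "body"));
+ * LinkedHashMap hsBuzzword2TfIdf = getBuzzwordsWithTfIdf(iDocNo, doc2modify, sFields, 10, true, reader);
+ * </pre>
+ * The returned scores follow the default similarity formula used in retrieveInterestingTerms, i.e. tf(term) * idf(term)^2 for this document.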
+ * + * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify + * @param doc2modify the document that should enriched with a new buzzword field + * @param sAttNames4BuzzwordCalculation the name of the attributes the buzzwords should be extracted from. + * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + * @param reader the lucene index reader + * + * @return the extracted buzzwords, boosted according their score. Key: the term itself. Value: the according score. null in the case the given attribute doesn't + * exist. + * + * @throws Exception + */ + static public LinkedHashMap getBuzzwordsWithTfIdf(int iDocNo, Document doc2modify, Set sAttNames4BuzzwordCalculation, + int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms, IndexReader reader) throws Exception + { + + MultiValueTreeMap tmScore2Term = + retrieveInterestingTerms(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 2, 1, 2, bSkipSimilarTerms, reader); + + if(tmScore2Term.valueSize() < iMaxNumberOfBuzzwords) + { + + MultiValueTreeMap tmScore2TermWeak = + retrieveInterestingTerms(iDocNo, doc2modify, sAttNames4BuzzwordCalculation, iMaxNumberOfBuzzwords, 1, 1, 2, bSkipSimilarTerms, reader); + + while (tmScore2TermWeak.keySize() > 0) + { + Float fTfIdf = tmScore2TermWeak.firstKey(); + String strTopTerm = tmScore2TermWeak.getFirst(fTfIdf); + tmScore2TermWeak.remove(fTfIdf, strTopTerm); + + if(!tmScore2Term.containsValue(strTopTerm)) tmScore2Term.add(fTfIdf, strTopTerm); + + if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) break; + } + } + + LinkedHashMap hsTerm2TfIdf = new LinkedHashMap(); + for (Entry score2term : tmScore2Term.entryList()) + hsTerm2TfIdf.put(score2term.getValue(), score2term.getKey()); + + + return hsTerm2TfIdf; + } + + + + /** + * + * @param iDocNo the lucene document number inside the index behind reader, for the document doc2modify + * @param doc2modify the document that should enriched with a new buzzword field + * @param strFieldName the field where you want the top frequent terms for. + * @param iMinFrequency the minimum frequency a term must appear in this field + * @param iMinWordLength the minimum word length a term must have + * @param iMaxNumberOfTerms the maximum number of terms the method returns + * @param reader the lucene index reader + * + * @return + * + * @throws Exception + */ + public static List getTopFrequentTerms(int iDocNo, Document doc2modify, String strFieldName, int iMinFrequency, int iMinWordLength, + int iMaxNumberOfTerms, IndexReader reader) throws Exception + { + + LinkedList llTerm2Frequency = new LinkedList(); + PriorityQueue pqTerm2Frequency = new PriorityQueue(iMaxNumberOfTerms, new Comparator() + { + + @Override + public int compare(Term2FrequencyEntry o1, Term2FrequencyEntry o2) + { + return o1.getFrequency().compareTo(o2.getFrequency()); + } + }); + + // wenn es das feld gar nicht gibt in diesem doc, dann machen wir gar nix! 
(das überprüfen ist erheblich billiger als das unnötige iterieren durch alles im reader + if(doc2modify.getField(strFieldName) == null) return llTerm2Frequency; + + Terms termVector = reader.getTermVector(iDocNo, strFieldName); + if(termVector == null) return llTerm2Frequency; + + TermsEnum termsEnum = termVector.iterator(); + + while (termsEnum.next() != null) + { + String strTerm = termsEnum.term().utf8ToString(); + long lFrequency = termsEnum.totalTermFreq(); + + if(lFrequency >= iMinFrequency && strTerm.length() >= iMinWordLength) + pqTerm2Frequency.add(new Term2FrequencyEntry(strTerm, Long.valueOf(lFrequency).intValue())); + + if(pqTerm2Frequency.size() > iMaxNumberOfTerms) pqTerm2Frequency.poll(); + } + + for (Term2FrequencyEntry term2Frq : pqTerm2Frequency) + llTerm2Frequency.add(0, term2Frq); + + + + return llTerm2Frequency; + } + + + + static MultiValueTreeMap retrieveInterestingTerms(int iDocNo, Document doc2modify, Set sAttNames4BuzzwordCalculation, + int iMaxNumberOfBuzzwords, int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, IndexReader reader) throws Exception + { + + int iIndexDocumentCount = reader.numDocs(); + + HashMap hsTerm2Frequency = new HashMap(); + + // als erstes werden die frequencies aller fields aufsummiert + for (String strFieldName : sAttNames4BuzzwordCalculation) + { + + // XXX: hier ist erst mal die Anzahl der verschiedenen Terme des docs hartkodiert + List topFrequentTerms = getTopFrequentTerms(iDocNo, doc2modify, strFieldName, iMinTermFreq, iMinWordLen, 1234, reader); + + for (Term2FrequencyEntry topTerm2FreqLocal : topFrequentTerms) + { + Integer iFreqOld = hsTerm2Frequency.get(topTerm2FreqLocal.getTerm()); + if(iFreqOld == null) + iFreqOld = topTerm2FreqLocal.getFrequency(); + else + iFreqOld += topTerm2FreqLocal.getFrequency(); + + hsTerm2Frequency.put(topTerm2FreqLocal.getTerm(), iFreqOld); + } + } + + // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert + MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class); + for (Entry term2Frequency : hsTerm2Frequency.entrySet()) + { + String strTerm = term2Frequency.getKey(); + Integer iTermFrequency = term2Frequency.getValue(); + + // wir haben angegeben, wie oft der Term mindestens da sein muß + if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue; + + // Zahlen ignorieren wir + if(!strTerm.matches("\\D+")) continue; + + // es wird die max-docFrequency berücksichtig (wie in MoreLikeThis) + int iMaxDocumentFrequency = 0; + for (String strField : sAttNames4BuzzwordCalculation) + { + int iDocumentFrequency = reader.docFreq(new Term(strField, strTerm)); + if(iMaxDocumentFrequency < iDocumentFrequency) iMaxDocumentFrequency = iDocumentFrequency; + } + + if(iMinDocFreq > 0 && iMaxDocumentFrequency < iMinDocFreq) continue; + + // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach + if(iMaxDocumentFrequency == 0) continue; + + // das ist die Formel der defaultSimilarity. 
Eine andere werden wir einfach nie brauchen + float fIdf = m_defaultSimilarity.idf(iMaxDocumentFrequency, iIndexDocumentCount); + float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf; + + boolean bRemoveLastTerm4Score = false; + // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus + if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) + { + // wir sind drüber + // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen + if(tmScore2Term.firstKey() >= fScore) continue; + // ansonsten tauschen wir unseren kleinsten aus + bRemoveLastTerm4Score = true; + } + + + // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score (alternativ + // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die Qualität + // verschlechtern) + Boolean bBetterSimilarTermInList = false; + if(bSkipSimilarTerms) + { + for (Entry score2TermInList : tmScore2Term.entryList()) + { + if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue; + // wenn der existierende größer ist, dann brauchen wir gar nix eintragen + if(score2TermInList.getKey() >= fScore) + { + bBetterSimilarTermInList = true; + break; + } + // wenn der neue vom score her besser ist, dann müssen wir den austauschen + tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue()); + } + } + + if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey()); + if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm); + } + + + return tmScore2Term; + } + + + + + + + // static MultiValueTreeMap retrieveInterestingTerms(String strDocID, Set sAttNames4BuzzwordCalculation, int iMaxNumberOfBuzzwords, + // int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, + // URINotFoundException, URISyntaxException + // { + // + // RemoteIndexReader reader = IndexAccessor.getMultiIndexReader(hsIndexPaths, true); + // int iIndexDocumentCount = reader.numDocs(); + // + // HashMap hsTerm2Frequency = new HashMap(); + // + // // als erstes werden die frequencies aller fields aufsummiert + // for (String strFieldName : sAttNames4BuzzwordCalculation) + // { + // + // // XXX: hier ist erst mal die Anzahl der verschiedenen Terme des docs hartkodiert + // List topFrequentTerms = reader.getTopFrequentTerms(strDocID, strFieldName, iMinTermFreq, iMinWordLen, 1234); + // + // for (Term2FrequencyEntry topTerm2FreqLocal : topFrequentTerms) + // { + // Integer iFreqOld = hsTerm2Frequency.get(topTerm2FreqLocal.getTerm()); + // if(iFreqOld == null) + // iFreqOld = topTerm2FreqLocal.getFrequency(); + // else + // iFreqOld += topTerm2FreqLocal.getFrequency(); + // + // hsTerm2Frequency.put(topTerm2FreqLocal.getTerm(), iFreqOld); + // } + // } + // + // // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert + // MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class); + // for (Entry term2Frequency : hsTerm2Frequency.entrySet()) + // { + // String strTerm = term2Frequency.getKey(); + // Integer iTermFrequency = term2Frequency.getValue(); + // + // // wir haben angegeben, wie oft der Term mindestens da sein muß + // if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue; + // + // // Zahlen ignorieren wir + // if(!strTerm.matches("\\D+")) continue; + // + // // es wird die max-docFrequency 
berücksichtig (wie in MoreLikeThis) + // int iMaxDocumentFrequency = 0; + // for (String strField : sAttNames4BuzzwordCalculation) + // { + // int iDocumentFrequency = reader.documentFrequency(strField, strTerm); + // if(iMaxDocumentFrequency < iDocumentFrequency) iMaxDocumentFrequency = iDocumentFrequency; + // } + // + // if(iMinDocFreq > 0 && iMaxDocumentFrequency < iMinDocFreq) continue; + // + // // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach + // if(iMaxDocumentFrequency == 0) continue; + // + // // das ist die Formel der defaultSimilarity. Eine andere werden wir einfach nie brauchen + // float fIdf = m_defaultSimilarity.idf(iMaxDocumentFrequency, iIndexDocumentCount); + // float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf; + // + // boolean bRemoveLastTerm4Score = false; + // // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus + // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) + // { + // // wir sind drüber + // // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen + // if(tmScore2Term.firstKey() >= fScore) continue; + // // ansonsten tauschen wir unseren kleinsten aus + // bRemoveLastTerm4Score = true; + // } + // + // + // // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score (alternativ + // // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die Qualität + // // verschlechtern) + // Boolean bBetterSimilarTermInList = false; + // if(bSkipSimilarTerms) + // { + // for (Entry score2TermInList : tmScore2Term.entryList()) + // { + // if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue; + // // wenn der existierende größer ist, dann brauchen wir gar nix eintragen + // if(score2TermInList.getKey() >= fScore) + // { + // bBetterSimilarTermInList = true; + // break; + // } + // // wenn der neue vom score her besser ist, dann müssen wir den austauschen + // tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue()); + // } + // } + // + // if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey()); + // if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm); + // } + // + // + // return tmScore2Term; + // } + + + // + // /** + // * This method is for calculating buzzwords out of an arbritrary String, by giving an index attribute as 'context. The string will be tokenized according the given + // * analyzer for this attribute (as set by the IndexAccessor default analyzer), and also takes the document frequencies for all terms of this attribute. + // * + // * @param strDocumentText the text of the document. This text influences the buzzword calculation as it would be an attribute value of + // * strAttributeName4BuzzwordCalculation + // * @param strAttributeName4BuzzwordCalculation this is the name of the attribute the given text should be differentiated against with buzzwords + // * @param iMaxNumberOfBuzzwords the maximum number of buzzwords + // * @param iMinDocFreq + // * @param iMinTermFreq + // * @param iMinWordLen + // * @param bSkipSimilarTerms true: similar terms (according to the Levenshtein-distance) will be skipped for better readability + // * @param hsIndexPaths the list of indices that should be used for buzzword calculation. 
The document must be stored in exactly one index, referenced by the document + // * object value of strIdFieldName. + // * + // * @return the extracted buzzwords, sorted by their according tfidf value. Key: the tfIdf value. Value: the term. + // * + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URINotFoundException + // * @throws URISyntaxException + // */ + // static MultiValueTreeMap retrieveInterestingTerms(String strDocumentText, String strAttributeName4BuzzwordCalculation, int iMaxNumberOfBuzzwords, + // int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, + // URINotFoundException, URISyntaxException + // { + // + // RemoteIndexReader reader = IndexAccessor.getMultiIndexReader(hsIndexPaths, true); + // int iIndexDocumentCount = reader.numDocs(); + // + // // hier tokenisieren wir den übergebenen Text und ermitteln die term frequencies + // HashMap hsTerm2Frequency = new HashMap(); + // + // TokenStream tokenStream = IndexAccessor.getDefaultAnalyzer().tokenStream(strAttributeName4BuzzwordCalculation, strDocumentText); + // + // tokenStream.reset(); + // while (tokenStream.incrementToken()) + // { + // // hier ermitteln wir die termfrequenzen für das aktuelle AttValue + // CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class); + // String strTerm = termAttribute.toString(); + // + // Integer iFrequency = hsTerm2Frequency.get(strTerm); + // if(iFrequency == null) + // hsTerm2Frequency.put(strTerm, 1); + // else + // hsTerm2Frequency.put(strTerm, iFrequency + 1); + // } + // tokenStream.close(); + // + // + // + // // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert + // MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class); + // for (Entry term2Frequency : hsTerm2Frequency.entrySet()) + // { + // String strTerm = term2Frequency.getKey(); + // Integer iTermFrequency = term2Frequency.getValue(); + // + // + // if(strTerm.length() < iMinWordLen) continue; + // // wir haben angegeben, wie oft der Term mindestens da sein muß + // if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue; + // + // // Zahlen ignorieren wir + // if(!strTerm.matches("\\D+")) continue; + // + // int iDocumentFrequency = reader.documentFrequency(strAttributeName4BuzzwordCalculation, strTerm); + // + // if(iMinDocFreq > 0 && iDocumentFrequency < iMinDocFreq) continue; + // + // // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach + // if(iDocumentFrequency == 0) continue; + // + // // das ist die Formel der defaultSimilarity. 
Eine andere werden wir einfach nie brauchen + // float fIdf = m_defaultSimilarity.idf(iDocumentFrequency, iIndexDocumentCount); + // float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf; + // + // boolean bRemoveLastTerm4Score = false; + // // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus + // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) + // { + // // wir sind drüber + // // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen + // if(tmScore2Term.firstKey() >= fScore) continue; + // // ansonsten tauschen wir unseren kleinsten aus + // bRemoveLastTerm4Score = true; + // } + // + // + // // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score (alternativ + // // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die Qualität + // // verschlechtern) + // Boolean bBetterSimilarTermInList = false; + // if(bSkipSimilarTerms) + // { + // for (Entry score2TermInList : tmScore2Term.entryList()) + // { + // if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue; + // // wenn der existierende größer ist, dann brauchen wir gar nix eintragen + // if(score2TermInList.getKey() >= fScore) + // { + // bBetterSimilarTermInList = true; + // break; + // } + // // wenn der neue vom score her besser ist, dann müssen wir den austauschen + // tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue()); + // } + // } + // + // if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey()); + // if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm); + // } + // + // + // + // return tmScore2Term; + // } + + // + // + // static List> retrieveInterestingTerms4AllFieldValues(String strDocID, String strFieldName, int iMaxNumberOfBuzzwords, + // int iMinDocFreq, int iMinTermFreq, int iMinWordLen, boolean bSkipSimilarTerms, LinkedHashSet hsIndexPaths) throws CorruptIndexException, IOException, + // URINotFoundException, URISyntaxException + // { + // + // RemoteIndexReader reader = IndexAccessor.getMultiIndexReader(hsIndexPaths, true); + // int iIndexDocumentCount = reader.numDocs(); + // + // + // LinkedList> llScore2TermMaps = new LinkedList>(); + // + // // XXX: hier ist erst mal die Anzahl der verschiedenen Terme des docs hartkodiert + // for (List lTerm2Frequencies : reader.getTopFrequentTermsPerAttributeValue(strDocID, strFieldName, iMinTermFreq, iMinWordLen, 1234)) + // { + // + // // nun werden die Terme bezüglich ihres scores (tfIdf) sortiert + // MultiValueTreeMap tmScore2Term = new MultiValueTreeMap(HashSet.class); + // for (Term2FrequencyEntry term2Frequency : lTerm2Frequencies) + // { + // String strTerm = term2Frequency.getTerm(); + // Integer iTermFrequency = term2Frequency.getFrequency(); + // + // // wir haben angegeben, wie oft der Term mindestens da sein muß + // if(iMinTermFreq > 0 && iTermFrequency < iMinTermFreq) continue; + // + // // Zahlen ignorieren wir + // if(!strTerm.matches("\\D+")) continue; + // + // int iDocumentFrequency = reader.documentFrequency(strFieldName, strTerm); + // + // if(iMinDocFreq > 0 && iDocumentFrequency < iMinDocFreq) continue; + // + // // das sollte eigentlich nicht passieren - im Fehlerfall ignorieren wir das einfach + // if(iDocumentFrequency == 0) continue; + // + // // das ist die Formel der defaultSimilarity. 
Eine andere werden wir einfach nie brauchen + // float fIdf = m_defaultSimilarity.idf(iDocumentFrequency, iIndexDocumentCount); + // float fScore = m_defaultSimilarity.tf(iTermFrequency) * fIdf * fIdf; + // + // boolean bRemoveLastTerm4Score = false; + // // nur die top -Terme - wenn wir über die max-Anzahl sind, dann tauschen wir den kleinsten aus + // if(tmScore2Term.valueSize() >= iMaxNumberOfBuzzwords) + // { + // // wir sind drüber + // // wenn unser kleinster schon größer ist, dann ignorieren wir den neuen + // if(tmScore2Term.firstKey() >= fScore) continue; + // // ansonsten tauschen wir unseren kleinsten aus + // bRemoveLastTerm4Score = true; + // } + // + // + // // wir schauen, ob wir schon einen term drin haben, der uns sehr ähnlich sieht - dann nehmen wir den mit dem höchsten score + // // (alternativ + // // wäre auch der kürzere möglich, aber der könnte einen niederen score haben, und dann später wieder rausfliegen - das würde die + // // Qualität + // // verschlechtern) + // Boolean bBetterSimilarTermInList = false; + // if(bSkipSimilarTerms) + // { + // for (Entry score2TermInList : tmScore2Term.entryList()) + // { + // if(!Levenshtein.isInDistance(score2TermInList.getValue(), strTerm, 3)) continue; + // // wenn der existierende größer ist, dann brauchen wir gar nix eintragen + // if(score2TermInList.getKey() >= fScore) + // { + // bBetterSimilarTermInList = true; + // break; + // } + // // wenn der neue vom score her besser ist, dann müssen wir den austauschen + // tmScore2Term.remove(score2TermInList.getKey(), score2TermInList.getValue()); + // } + // } + // + // if(bRemoveLastTerm4Score && !bBetterSimilarTermInList) tmScore2Term.remove(tmScore2Term.firstKey()); + // if(!bBetterSimilarTermInList) tmScore2Term.add(fScore, strTerm); + // } + // + // llScore2TermMaps.add(tmScore2Term); + // } + // + // + // + // return llScore2TermMaps; + // } + + + + + + + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/DocumentFrqClass.java b/src/main/java/de/dfki/km/leech/lucene/basic/DocumentFrqClass.java new file mode 100644 index 0000000..bf0fab3 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/DocumentFrqClass.java @@ -0,0 +1,179 @@ +package de.dfki.km.leech.lucene.basic; + + + +// import de.dfki.inquisitor.lucene.DynamicFieldType; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.mapdb.DB; +import org.mapdb.DBMaker; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; + + + +public class DocumentFrqClass implements Closeable +{ + + + protected Map m_hsTerm2IndexFrq; + + protected long m_lMaxFrq = 0; + + protected DB m_mapDB; + + protected IndexReader m_reader; + + protected String m_strFieldName4Calculation; + + protected String m_strMaxFrqTerm = ""; + + + + @SuppressWarnings("unchecked") + public DocumentFrqClass(IndexReader reader, String strFieldName4Calculation) + { + m_reader = reader; + m_strFieldName4Calculation = strFieldName4Calculation; + + try + { + Logger.getLogger(DocumentFrqClass.class.getName()).info("load overall term index frequencies"); + + + // OLD: m_mapDB = 
DBMaker.newTempFileDB().deleteFilesAfterClose().closeOnJvmShutdown().transactionDisable().make(); + // m_hsTerm2IndexFrq = m_mapDB.getTreeMap("temp"); + m_mapDB = DBMaker.tempFileDB().closeOnJvmShutdown().fileDeleteAfterOpen().fileDeleteAfterClose().fileLockDisable().fileMmapEnableIfSupported().make(); + m_hsTerm2IndexFrq = (Map) m_mapDB.treeMap("temp").create(); + + + + Terms terms; + + terms = MultiFields.getTerms(reader, strFieldName4Calculation); + + + if(terms != null) + { + TermsEnum termsEnum = terms.iterator(); + + while (termsEnum.next() != null) + { + long lFrequency = termsEnum.totalTermFreq(); + String strTerm = termsEnum.term().utf8ToString(); + + m_hsTerm2IndexFrq.put(strTerm, lFrequency); + if(lFrequency > m_lMaxFrq) + { + m_lMaxFrq = lFrequency; + m_strMaxFrqTerm = strTerm; + } + } + } + + + Logger.getLogger(DocumentFrqClass.class.getName()).info("...finished"); + + } + catch (Throwable e) + { + Logger.getLogger(DocumentFrqClass.class.getName()).log(Level.SEVERE, "Error", e); + } + + } + + + + public boolean addDocumentFrequencyClass(int iDocNo, Document doc2modify, String strNewField4FrqClass) throws Exception + { + + boolean bModified = false; + if(doc2modify.getField(strNewField4FrqClass) != null) bModified = true; + + doc2modify.removeFields(strNewField4FrqClass); + + if(doc2modify.getField(m_strFieldName4Calculation) == null) return bModified; + + + double dAverageFrqClass = 0; + int iFrqClassesCount = 0; + + + + Terms termVector = m_reader.getTermVector(iDocNo, m_strFieldName4Calculation); + if(termVector == null) return bModified; + + TermsEnum termsEnum = termVector.iterator(); + + while (termsEnum.next() != null) + { + String strTerm = termsEnum.term().utf8ToString(); + // reine Zahlen sind draussen + if(strTerm.matches("\\d*")) continue; + // das zählt nur für dieses doc, siehe ApiDoc reader.getTermVector(..) 
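+ // In other words: totalTermFreq() on a per-document term vector yields the frequency of the term
+ // within this single document, while m_hsTerm2IndexFrq holds the corpus-wide frequency cached in the
+ // constructor. The frequency class computed below is floor(log2(m_lMaxFrq / lFrequencyInIndex)): the
+ // most frequent term of the index falls into class 0, terms roughly half as frequent into class 1,
+ // and so on; the document then gets the frequency-weighted average over all classes >= 2.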
+ long lFrequencyInDoc = termsEnum.totalTermFreq(); + + + Long lFrequencyInIndex = m_hsTerm2IndexFrq.get(strTerm); + if(lFrequencyInIndex == null) continue; + + int iFrqClass; + if(m_lMaxFrq <= 0 || lFrequencyInIndex <= 0) + iFrqClass = -1; + else + iFrqClass = (int) Math.floor((Math.log((m_lMaxFrq / lFrequencyInIndex)) / Math.log(2))); + + if(iFrqClass >= 2) + { + dAverageFrqClass += iFrqClass * lFrequencyInDoc; + iFrqClassesCount += lFrequencyInDoc; + } + } + + + + if(iFrqClassesCount >= 0) dAverageFrqClass = dAverageFrqClass / iFrqClassesCount; + + // wir diskretisieren auf halbe Werte + dAverageFrqClass = Math.round(dAverageFrqClass * 2); + // als Integer, ohne Nachkommastellen (der eigentliche Wert mal 10) + int iAverageFrqClass = (int) (dAverageFrqClass * 5d); + + + + // und an das doc dran + FieldType fieldType = + new DynamicFieldType().setIndexOptionS(IndexOptions.DOCS).setStoreD(true).setStoreTermVectorS(true) + .setStoreTermVectorOffsetS(true).setTokenizeD(true).freezE(); + + Field field4buzzwords = new Field(strNewField4FrqClass, String.valueOf(iAverageFrqClass), fieldType); + + + doc2modify.add(field4buzzwords); + + + return true; + } + + + + @Override + public void close() throws IOException + { + if(m_mapDB != null) m_mapDB.close(); + m_mapDB = null; + m_hsTerm2IndexFrq = null; + m_reader = null; + } + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/DynamicFieldType.java b/src/main/java/de/dfki/km/leech/lucene/basic/DynamicFieldType.java new file mode 100644 index 0000000..7434112 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/DynamicFieldType.java @@ -0,0 +1,418 @@ +package de.dfki.km.leech.lucene.basic; + + + +import com.cedarsoftware.util.io.JsonReader; +import com.cedarsoftware.util.io.JsonWriter; +import de.dfki.inquisitor.text.DateParser; +import de.dfki.inquisitor.text.DateUtils; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexOptions; + +import java.io.IOException; +import java.util.Date; +import java.util.logging.Level; +import java.util.logging.Logger; + +// import de.dfki.inquisitor.lucene.LuceneAnalyzerFactory; + + + +@SuppressWarnings("javadoc") +public class DynamicFieldType extends FieldType +{ + + + public static final DynamicFieldType doubleFieldType = new DynamicFieldType(LegacyDoubleField.TYPE_STORED).freezE(); + + public static final DynamicFieldType floatFieldType = new DynamicFieldType(LegacyFloatField.TYPE_STORED).freezE(); + + public static final DynamicFieldType integerFieldType = new DynamicFieldType(LegacyIntField.TYPE_STORED).freezE(); + + public static final DynamicFieldType dateFieldType = new DynamicFieldType(LegacyLongField.TYPE_STORED).setDateParsing(true).freezE(); + + public static final DynamicFieldType keywordFieldType = + new DynamicFieldType().setTokenizeD(true).setStoreD(true).setStoreTermVectorS(true).setStoreTermVectorOffsetS(true) + .setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS).setOmitNormS(true).setAnalyzer("org.apache.lucene.analysis.core.KeywordAnalyzer") + .freezE(); + + public static final DynamicFieldType longFieldType = new DynamicFieldType(LegacyLongField.TYPE_STORED).freezE(); + + public static final DynamicFieldType tokenizedFieldType = + new DynamicFieldType().setTokenizeD(true).setStoreD(true).setStoreTermVectorS(true).setStoreTermVectorOffsetS(true) + 
.setIndexOptionS(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS).setAnalyzer("de.dfki.km.leech.lucene.LeechSimpleAnalyzer").freezE(); + + + + /** + * Create Field instances, according to the configuration inside the given fieldType. Number fields will be generated, if a string value is given, it will be + * converted in the case the fieldType is a number type. Further, the method parses Strings for date if the fieldtype is of type {@link DynamicFieldType} and + * configured accordingly. You can also give number values for generating number or String fields fields (also according to the given fieldType). + * + * @param strAttName the attributes name + * @param attValue the attributes value + * @param fieldType the field type that influences the returned type of the field + * + * @return the field, with the configured fieldType. Null in the case the Field can not be generated out of the value. + */ + static public Field createField(String strAttName, Object attValue, FieldType fieldType) + { + try + { + if (attValue == null) + return null; + + + if (fieldType instanceof DynamicFieldType && ((DynamicFieldType) fieldType).getDateParsing() && attValue instanceof String) + { + Date parsedDate = DateParser.parseDateString((String) attValue); + if (parsedDate != null) + return new LegacyLongField(strAttName, DateUtils.date2Number(parsedDate), fieldType); + else + return null; + } + else if (attValue instanceof String) + { + + if (fieldType.numericType() == LegacyNumericType.INT) + return new LegacyIntField(strAttName, Integer.valueOf((String) attValue), fieldType); + else if (fieldType.numericType() == LegacyNumericType.LONG) + return new LegacyLongField(strAttName, Long.valueOf((String) attValue), fieldType); + else if (fieldType.numericType() == LegacyNumericType.FLOAT) + return new LegacyFloatField(strAttName, Float.valueOf((String) attValue), fieldType); + else if (fieldType.numericType() == LegacyNumericType.DOUBLE) + return new LegacyDoubleField(strAttName, Double.valueOf((String) attValue), fieldType); + else + return new Field(strAttName, (String) attValue, fieldType); + } + else if (attValue instanceof Number) + { + + if (fieldType.numericType() == LegacyNumericType.INT) + return new LegacyIntField(strAttName, ((Number) attValue).intValue(), fieldType); + else if (fieldType.numericType() == LegacyNumericType.LONG) + return new LegacyLongField(strAttName, ((Number) attValue).longValue(), fieldType); + else if (fieldType.numericType() == LegacyNumericType.FLOAT) + return new LegacyFloatField(strAttName, ((Number) attValue).floatValue(), fieldType); + else if (fieldType.numericType() == LegacyNumericType.DOUBLE) + return new LegacyDoubleField(strAttName, ((Number) attValue).doubleValue(), fieldType); + else + return new Field(strAttName, String.valueOf(attValue), fieldType); + } + else + return null; + } catch (Exception e) + { + Logger.getLogger(FieldConfig.class.getName()).log(Level.SEVERE, "Error", e); + return null; + } + } + protected String analyzer; + protected boolean dateParsing = false; + + public DynamicFieldType() + { + super(); + } + + + + public DynamicFieldType(FieldType ref) + { + super(ref); + } + + + + public Analyzer createAnalyzer() + { + try + { + + return LuceneAnalyzerFactory.createAnalyzer(getAnalyzer(), null); + } catch (Exception e) + { + Logger.getLogger(DynamicFieldType.class.getName()).log(Level.SEVERE, "Error", e); + return null; + } + } + + + + /** + * Create Field instances, according to the configuration inside the given fieldType. 
Number fields will be generated, if a string value is given, it will be + * converted in the case the fieldType is a number type. Further, the method parses Strings for date if the fieldtype is of type {@link DynamicFieldType} and + * configured accordingly. You can also give number values for generating number or String fields fields (also according to the given fieldType). + * + * @param strAttName the attributes name + * @param attValue the attributes value + * + * @return the field, with the configured fieldType. Null in the case the Field can not be generated out of the value. + */ + public Field createField(String strAttName, Object attValue) + { + return createField(strAttName, attValue, this); + } + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType freezE() + { + super.freeze(); + + return this; + } + + + + /** + * works only if this is not frozen yet + */ + public void fromJson(String strJson) + { + try + { + DynamicFieldType ref = (DynamicFieldType) JsonReader.jsonToJava(strJson); + + // this.setIndexed(ref.indexed()); + this.setStored(ref.stored()); + this.setTokenized(ref.tokenized()); + this.setStoreTermVectors(ref.storeTermVectors()); + this.setStoreTermVectorOffsets(ref.storeTermVectorOffsets()); + this.setStoreTermVectorPositions(ref.storeTermVectorPositions()); + this.setStoreTermVectorPayloads(ref.storeTermVectorPayloads()); + this.setOmitNorms(ref.omitNorms()); + this.setIndexOptions(ref.indexOptions()); + this.setDocValuesType(ref.docValuesType()); + this.setNumericType(ref.numericType()); + this.setNumericPrecisionStep(ref.numericPrecisionStep()); + + this.setAnalyzer(ref.getAnalyzer()); + } catch (IOException e) + { + throw new RuntimeException(e); + } + } + + + + /** + * Get the analyzer for this class. This is additionaly to the upper Lucene Fieldtype, for convinience. Returns this as sugar. + */ + public String getAnalyzer() + { + return this.analyzer; + } + + + + public boolean getDateParsing() + { + return dateParsing; + } + + + + /** + * Set the analyzer for this class. The given String is the full class name of the analyzer, that can be used with Class.forName(..). This is additionaly to the upper + * Lucene Fieldtype, for convinience. Returns this as sugar. + */ + public DynamicFieldType setAnalyzer(String analyzer) + { + this.analyzer = analyzer; + + return this; + } + + + + /** + * Specifies whether the values of this field should be parsed as date values or not. If true, all input strings will be parsed and written as according number into + * the index + * + * @return this as sugar + */ + public DynamicFieldType setDateParsing(boolean enableDateParsing) + { + this.dateParsing = enableDateParsing; + + return this; + } + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setDocValuesTypE(DocValuesType type) + { + super.setDocValuesType(type); + + return this; + } + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setIndexOptionS(IndexOptions value) + { + super.setIndexOptions(value); + + return this; + } + + + + // /** + // * Same functionality as in upper class method, but returns this as sugar. + // **/ + // public DynamicFieldType setIndexeD(boolean value) + // { + // super.setIndexed(value); + // + // return this; + // } + + + + /** + * Same functionality as in upper class method, but returns this as sugar. 
+ **/ + public DynamicFieldType setNumericPrecisionSteP(int precisionStep) + { + super.setNumericPrecisionStep(precisionStep); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setNumericTypE(LegacyNumericType type) + { + super.setNumericType(type); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setOmitNormS(boolean value) + { + super.setOmitNorms(value); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setStoreD(boolean value) + { + super.setStored(value); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setStoreTermVectorOffsetS(boolean value) + { + super.setStoreTermVectorOffsets(value); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setStoreTermVectorPayloadS(boolean value) + { + super.setStoreTermVectorPayloads(value); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setStoreTermVectorPositionS(boolean value) + { + super.setStoreTermVectorPositions(value); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setStoreTermVectorS(boolean value) + { + super.setStoreTermVectors(value); + + return this; + } + + + + + /** + * Same functionality as in upper class method, but returns this as sugar. + **/ + public DynamicFieldType setTokenizeD(boolean value) + { + super.setTokenized(value); + + return this; + } + + + + + public String toJson(boolean bFormatIt) + { + try + { + String strJson = JsonWriter.objectToJson(this); + + if (bFormatIt) + strJson = JsonWriter.formatJson(strJson); + + // TODO abchecken, ob das noch nötig ist: https://github.com/jdereg/json-io/issues/19 + return strJson.replaceAll(",\\s*\"ordinal\":\\d+", ""); + } catch (IOException e) + { + throw new RuntimeException(e); + } + } +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/FieldConfig.java b/src/main/java/de/dfki/km/leech/lucene/basic/FieldConfig.java new file mode 100644 index 0000000..b352661 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/FieldConfig.java @@ -0,0 +1,135 @@ +package de.dfki.km.leech.lucene.basic; + + + +import com.cedarsoftware.util.io.JsonReader; +import com.cedarsoftware.util.io.JsonWriter; +// import de.dfki.inquisitor.lucene.LuceneAnalyzerFactory; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; +import org.apache.lucene.document.Field; + +import java.io.IOException; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; + + + +public class FieldConfig +{ + + + + public DynamicFieldType defaultFieldType = new DynamicFieldType(); + + + + public HashMap fieldName2FieldType = new HashMap(); + + + + /** + * Creates a new Analyzer out of this {@link FieldConfig}, which is a {@link PerFieldAnalyzerWrapper} for all configured fields + * + * @return the according analyzer + * + * @throws Exception + */ + public Analyzer createAnalyzer() throws Exception + { + return LuceneAnalyzerFactory.createAnalyzer(this); + } + + + + + + + 
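+ // Usage sketch (illustrative only, not part of this class; the field names and the date value are assumptions):
+ // a FieldConfig maps individual field names to a DynamicFieldType, everything else falls back to defaultFieldType.
+ //
+ // FieldConfig config = new FieldConfig();
+ // config.defaultFieldType = DynamicFieldType.tokenizedFieldType;
+ // config.fieldName2FieldType.put("date", DynamicFieldType.dateFieldType);
+ // Field dateField = config.createField("date", "2021-03-01"); // stored as a number field if the string can be parsed as a date
+ // Analyzer analyzer = config.createAnalyzer();                 // a PerFieldAnalyzerWrapper over the configured field types
+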
+ /** + * Create Field instances, according to the fieldType mappings inside this {@link FieldConfig}. Number fields will be generated, if a string value is given, it will + * be converted in the case the fieldType is a number type. Further, the method parses Strings for date if the fieldtype is of type {@link DynamicFieldType} and + * configured accordingly. You can also give number values for generating number or String fields fields (also according to the given fieldType). + * + * @param strAttName the attributes name + * @param attValue the attributes value + * + * @return the field, with the configured fieldType. Null in the case the Field can not be generated out of the value. + */ + public Field createField(String strAttName, Object attValue) + { + DynamicFieldType fieldType = getFieldType(strAttName); + + return fieldType.createField(strAttName, attValue); + } + + + + + + public void fromJson(String strJson) + { + + try + { + FieldConfig fieldConfig = (FieldConfig) JsonReader.jsonToJava(strJson); + + this.defaultFieldType = fieldConfig.defaultFieldType; + + this.fieldName2FieldType = fieldConfig.fieldName2FieldType; + + + } + catch (IOException e) + { + Logger.getLogger(FieldConfig.class.getName()).log(Level.SEVERE, "Error", e); + } + + } + + + + /** + * Gets the field type for a specific field, as configured. In the case there is no explicit mapping for the field, the default type will be returned. + * + * @param strFieldName + * @return + */ + public DynamicFieldType getFieldType(String strFieldName) + { + DynamicFieldType fieldType = fieldName2FieldType.get(strFieldName); + + if(fieldType == null) fieldType = defaultFieldType; + + return fieldType; + } + + + + public String toJson(boolean bFormatIt) + { + try + { + + + HashMap hsOptions = new HashMap<>(); + hsOptions.put(JsonWriter.ENUM_PUBLIC_ONLY, true); + + String strJson = JsonWriter.objectToJson(this, hsOptions); + + + if(bFormatIt) strJson = JsonWriter.formatJson(strJson); + + // return strJson.replaceAll(",\\s*\"ordinal\":\\d+", ""); + return strJson; + + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/IndexAccessor.java b/src/main/java/de/dfki/km/leech/lucene/basic/IndexAccessor.java new file mode 100644 index 0000000..08d4a8e --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/IndexAccessor.java @@ -0,0 +1,1634 @@ +package de.dfki.km.leech.lucene.basic; + + + +import de.dfki.inquisitor.exceptions.ExceptionUtils; +import de.dfki.inquisitor.logging.LoggingUtils; +import de.dfki.inquisitor.text.StringUtils; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.NativeFSLockFactory; +import org.apache.lucene.util.Version; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.*; +import java.util.Map.Entry; +import java.util.logging.Level; +import java.util.logging.Logger; + + + +@SuppressWarnings({"JavaDoc", "PointlessBooleanExpression"}) +public class IndexAccessor +{ + + public static class BetterMultiReader extends MultiReader + { + + + public BetterMultiReader(IndexReader... 
subReaders) throws IOException + { + super(subReaders); + } + + + + public BetterMultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException + { + super(subReaders, closeSubReaders); + } + + + + public List getSubReaders() + { + return getSequentialSubReaders(); + } + } + + + + + /** + * Status constants for removeReaderFromCacheWhenPossible + * + * @author Christian Reuschling, Dipl.Ing.(BA) + */ + public static enum ReaderStatus { + READER_CLOSED, READER_IN_QUEUE, READER_NOT_IN_CACHE; + } + + + + + protected static class ReaderRefreshRunnable implements Runnable + { + + @Override + public void run() + { + + try + { + while (true) + { + + // wir warten das eingestellte Intervall + + // ich hatte mal die Situation, daß der Thread nur im korrekten Intervall ausgeführt wird, wenn hier vor dem Sleep noch eine + // Ausgabe steht - da das eigentlich nicht sein kann, und das nur zum debuggen relevant war, mach ich das mal wieder weg. Er kam + // dann, aber halt nicht so oft. Aber schon innerhalb 2min (und nicht 10ms, wie ich es da wollte) + // LinkedList dummy = new LinkedList(); + // System.err.print("."); + Thread.sleep(m_lReaderRefreshIntervall); + + Logger.getLogger(this.getClass().getName()).fine("will refresh all index readers"); + + IndexAccessor.refreshAllIndexReaders(); + } + + } + catch (Exception e) + { + throw new RuntimeException(e); + } + } + } + + + private static String m_strIdAttributeName; + + protected static Logger logger = Logger.getLogger(IndexAccessor.class.getName()); + + protected static Analyzer m_analyzer4writer; + + // protected static boolean m_bNativeFileLock = true; + + protected static HashMap m_hsIndexPathOrId2CurrentIndexReader = new HashMap(); + + // protected static HashMap m_hsIndexPathOrURL2CurrentRemoteSearcher = new HashMap(); + + // wenn man mehrere Instanzen von luceneIndexSet hat, darf trotzdem nur ein Writer pro Index offen sein + protected static HashMap m_hsIndexPathOrURL2Writer = new HashMap(); + + protected static HashMap m_hsIndexReader2IndexPath = new HashMap(); + + protected static HashMap m_hsIndexReader2ReaderRefCount = new HashMap(); + + + protected static HashMap m_hsIndexWriter2WriterRefCount = new HashMap(); + + + + + + + protected static HashSet m_hsReader2Remove = new HashSet(); + + + + protected static HashSet m_hsStaticIndexReaderSet = new HashSet(); + + + + + + protected static long m_lReaderRefreshIntervall = 1000 * 60 * 2; + + + + static + { + + try + { + + + + // wir starten den Thread, der die reader objekte refreshed + + Thread readerRefreshThread = new Thread(new ReaderRefreshRunnable(), "IndexAccessor reader refresh thread"); + readerRefreshThread.setDaemon(true); + // welche Priority? ich hatte mal das Gefühl, daß der recht selten dran kommt + // readerRefreshThread.setPriority(Thread.MIN_PRIORITY); + // readerRefreshThread.setPriority(Thread.MAX_PRIORITY); + readerRefreshThread.start(); + + + + // ein shutdown hook um sicherzustellen, daß auch alle Objekte geschlossen werden - wir wollen ja keine anderen Prozesse blockieren + + Runtime.getRuntime().addShutdownHook(new Thread() + { + @Override + public void run() + { + try + { + IndexAccessor.forceCloseAll(); + } + catch (Exception ex) + { + throw new RuntimeException(ex); + } + } + }); + + + } + catch (Exception e) + { + Logger.getLogger(IndexAccessor.class.getName()).log(Level.SEVERE, "Error", e); + } + + } + + + + + /** + * Adds a reader object to the cache. 
This reader will be static, which means that it won't be refreshed in any case, independent of which method you invoke on + * {@link IndexAccessor}, nor in the refresh-Thread. You can get this reader with {@link #getLuceneIndexReader(String, boolean)}, with strIndexID as parameter.You also can remove + * the reader from cache with {@link #removeReaderFromCache(String)}, {@link #removeReaderFromCacheWhenPossible(String)} and {@link #removeUnusedReadersFromCache()} + * + * + * @param strIndexID a unique ID for the reader + * @param staticReader the reader Object + */ + synchronized static public void addStaticReader(String strIndexID, IndexReader staticReader) + { + // wir merken uns den Reader, damit wir ihn nicht später aus Versehen ersetzen/refreshen + m_hsStaticIndexReaderSet.add(staticReader); + + // und mit seiner ID kommt er auch noch in den Cache + m_hsIndexPathOrId2CurrentIndexReader.put(strIndexID, staticReader); + } + + + + /** + * Creates a new, empty Lucene index under the given path + * + * @param strIndexPathOrURL the path for the new Lucene index. In the case the path does not exists, it will be created + * @param bForceAndOverwrite if this is false, the index will be only created in the case there is no existing index under strIndexPathOrURL + * + * @return true in the case the index was newly created, false otherwise. In the case strIndexPathOrURL exists and is a file, it will not created in any case + * + * @throws IOException + * @throws CorruptIndexException + */ + synchronized static public boolean createNewIndex(String strIndexPathOrURL, boolean bForceAndOverwrite) throws CorruptIndexException, IOException + { + boolean bCreateNew = false; + + File fIndexPath = new File(strIndexPathOrURL); + + if(!fIndexPath.exists()) + { + fIndexPath.mkdirs(); + + bCreateNew = true; + } + + FSDirectory dir = createFSDirectory(fIndexPath); + + if(bCreateNew == false && (!DirectoryReader.indexExists(dir) || bForceAndOverwrite)) + { + bCreateNew = true; + } + + if(!bCreateNew) return false; + + + + logger.fine("will open indexWriter for '" + strIndexPathOrURL + "'"); + + // wenn fäschlicherweise z.B. ein video-attachment als fulltext verarbeitet wird, haben wir riesige Docs, viel Speicher, lange Zeiten...aus + // diesem Grund setzte ich die MaxFieldLength mal wieder auf limited + @SuppressWarnings("deprecation") + IndexWriter ourIndexWriter = new IndexWriter(dir, new IndexWriterConfig(getDefaultAnalyzer()).setOpenMode(OpenMode.CREATE)); + + ourIndexWriter.close(); + + return true; + } + + + + + + // /** + // * Enable or disable native file locking. We recommend the native lock, which is also the default. + // * + // * @param bNativeFileLock true in the case you want to use native file OS locks. These could be problematic on NFS drives (see {@link NativeFSLockFactory}). 
I + // * recommend to use the native File lock (stress tests on our NFS system have shown that this is really an atomar, working lock - the other lock leads to + // * exceptions (at least in ealier versions of Lucene) + // */ + // static public void enableNativeFileLock(boolean bNativeFileLock) + // { + // m_bNativeFileLock = bNativeFileLock; + // } + + + + /** + * Gets the default analyzer that will be used for writer creation + * + * @return the default analyzer that will be used for writer creation + */ + static public Analyzer getDefaultAnalyzer() + { + return m_analyzer4writer; + } + + + + /** + * Gets the default attribute name that will be used for RemotIndexReader creation + * + * @return the default attribute name that will be used for RemotIndexReader creation + */ + static public String getDefaultIndexIdAttribute() + { + return IndexAccessor.m_strIdAttributeName; + } + + + // + // /** + // * Gets the reader for a given index path. The reader will be refreshed if there are any new changes in the index. In the case you pass an static reader ID to this + // * method, it will be identically to {@link #getIndexReader(String)}. You dont have to release a RemoteIndexReader. + // * + // * @param strIndexPathOrURL the path to the index where you want to read from + // * + // * @return the reader object that reflects the current state of the index + // * + // * @throws IOException + // * @throws CorruptIndexException + // * @throws URISyntaxException + // */ + // public synchronized static RemoteIndexReader getFreshIndexReader(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException + // { + // refreshIndexReader(strIndexPathOrURL, false); + // + // return getIndexReader(strIndexPathOrURL); + // } + + // + // + // /** + // * Gets the reader for the given index path. The reader will be created when necessary. In the case the specified directory does not exists or is empty, an empty + // * index will NOT be created.
+ // * Remark:
+ // * Note that refreshing a reader is a relatively expensive operation. The reader Object returned from this method does not necessarily reflect the current state of the index. To
+ // * get a guaranteed up-to-date, refreshed reader object, you have the following possibilities:
+ // *
+ // * - invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()}
+ // * - use the method {@link #getFreshIndexReader(String)}
+ // * - set a refresh interval with {@link #setReaderRefreshIntervall(long)}, so that all reader Objects returned by {@link #getLuceneIndexReader(String, boolean)} are refreshed periodically
    + // * You dont have to release a RemoteIndexReader. + // * + // * @param strIndexPathOrURL the path to the index you wants to read from + // * + // * @return the index reader object + // * + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URISyntaxException + // */ + // public synchronized static RemoteIndexReader getIndexReader(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException + // { + // return getIndexReader(strIndexPathOrURL, false); + // } + + + // + // /** + // * Gets the reader for the given index path. The reader will be created when necessary. In the case the specified directory does not exists or is empty, an empty + // * index will be created, if you want.
+ // * Remark:
+ // * Note that refreshing a reader is a relatively expensive operation. The reader Object returned from this method does not necessarily reflect the current state of the index. To
+ // * get a guaranteed up-to-date, refreshed reader object, you have the following possibilities:
+ // *
+ // * - invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()}
+ // * - use the method {@link #getFreshIndexReader(String)}
+ // * - set a refresh interval with {@link #setReaderRefreshIntervall(long)}, so that all reader Objects returned by {@link #getIndexReader(String, boolean)} are refreshed periodically
    + // * You dont have to release a RemoteIndexReader. + // * + // * @param strIndexPathOrURL the path to the index you wants to read from. This can be a simple path 'e.g. /home/hitzliputzli' or with URI Syntax + // * ('file:\\/home/hitzliputzli'). In the case the specified protocoll is not of type 'file', and delight is in the classpath, the method tries to create a + // * delight client object. + // * @param bCreateIndexIfNotExist if true, the index will be created in the case he did not exist + // * + // * @return the index reader object + // * + // * @throws CorruptIndexException + // * @throws IOException + // * @throws URISyntaxException + // */ + // synchronized static public RemoteIndexReader getIndexReader(String strIndexPathOrURL, boolean bCreateIndexIfNotExist) throws CorruptIndexException, IOException, + // URISyntaxException + // { + // + // RemoteIndexReader remoteIndexReader; + // + // + // if(isLocalPath(strIndexPathOrURL)) + // { + // // lokal - wir rufen einfach die entsprechene LuceneReader-Methode einmal auf, um das Objekt intern zu erstellen + // IndexReader luceneIndexReader = getLuceneIndexReader(strIndexPathOrURL, bCreateIndexIfNotExist); + // releaseLuceneIndexReader(luceneIndexReader); + // + // // das zugrundeliegende Objekt wurde initialisiert, nun einfach den String/Pfad basierten 'wrapper' + // remoteIndexReader = new RemoteIndexReaderImpl(strIndexPathOrURL, m_strIdAttributeName); + // } + // else + // { + // // wir versuchen, eine Verbindung zu einem RemoteReader aufzubauen + // strIndexPathOrURL = strIndexPathOrURL.replaceAll("/$", ""); + // String strHandlerName = strIndexPathOrURL.substring(strIndexPathOrURL.lastIndexOf('/') + 1) + "_reader"; + // String strServiceUrl = strIndexPathOrURL.replaceAll("/[^/]+$", ""); + // + // + // remoteIndexReader = delight.connectingTo(strServiceUrl).usingApi(strHandlerName, RemoteIndexReader.class); + // } + // + // + // return remoteIndexReader; + // } + + + + + + /** + * Gets all index paths that are currently inside the reader cache + * + * @return all index paths that are currently inside the reader cache + */ + public static Set getIndexReaderPathsAndIDs() + { + return m_hsIndexPathOrId2CurrentIndexReader.keySet(); + } + + + // + // synchronized static public RemoteIndexSearcher getIndexSearcher(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException + // { + // RemoteIndexSearcher searcher4Index; + // + // + // if(isLocalPath(strIndexPathOrURL)) + // { + // + // // lokal - wir rufen einfach die entsprechene LuceneReader-Methode einmal auf, um das Objekt intern zu erstellen + // IndexReader luceneIndexReader = getLuceneIndexReader(strIndexPathOrURL, false); + // releaseLuceneIndexReader(luceneIndexReader); + // + // // das zugrundeliegende Objekt wurde initialisiert, nun einfach den String/Pfad basierten 'wrapper' + // searcher4Index = new RemoteIndexSearcherImpl(strIndexPathOrURL, m_strIdAttributeName); + // } + // else + // { + // + // // es gibt zumindest keinen lokalen Index - dann könnte es noch eine remotegeschichte sein + // + // searcher4Index = m_hsIndexPathOrURL2CurrentRemoteSearcher.get(strIndexPathOrURL); + // if(searcher4Index == null) + // { + // + // logger.fine("will create new remote searcher for index '" + strIndexPathOrURL + "'"); + // + // strIndexPathOrURL = strIndexPathOrURL.replaceAll("/$", ""); + // String strHandlerName = strIndexPathOrURL.substring(strIndexPathOrURL.lastIndexOf('/') + 1) + "_searcher"; + // String strServiceUrl = 
strIndexPathOrURL.replaceAll("/[^/]+$", ""); + // + // + // searcher4Index = delight.connectingTo(strServiceUrl).usingApi(strHandlerName, RemoteIndexSearcher.class); + // + // + // m_hsIndexPathOrURL2CurrentRemoteSearcher.put(strIndexPathOrURL, searcher4Index); + // } + // } + // + // + // return searcher4Index; + // } + // + + + /** + * Gets a writer instance for an index. DON'T !!!!! close your writer afterwards - use the >>>>> releaseIndexWriter(..) <<<<< method instead, and make SURE not to + * forget this. The close will be done automatically, and you would permit any other threads to work with the index by doing this. The default analyzer will be used
+ * In the case the specified directory does not exist or is empty, an empty index will be created.
+ * Remark:
+ * You can change the timeout Lucene waits for getting write access by setting IndexWriter.WRITE_LOCK_TIMEOUT.
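+ * Usage sketch (the index path '/tmp/myIndex' and the Document 'doc' are only illustrative placeholders):
+ *   IndexWriter writer = IndexAccessor.getIndexWriter("/tmp/myIndex");
+ *   try { writer.addDocument(doc); }
+ *   finally { IndexAccessor.releaseIndexWriter(writer); }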
    + * It is in almost any case I can imagine no good idea to have an IndexWriter member variable that refers on the reference from this method. This will block all other + * processes that wants to get access to the index. You can make this in a short-living Object, but know exactly what yo do... + * + * @param strIndexPathOrURL the path to the index + * + * @return a writer instance for the given index. Autocommit will be FALSE. + * + * @throws CorruptIndexException + * @throws LockObtainFailedException + * @throws IOException + */ + synchronized static public IndexWriter getIndexWriter(String strIndexPathOrURL) throws CorruptIndexException, LockObtainFailedException, IOException + { + if(getDefaultAnalyzer() == null) logger.severe("default analyzer is not set - this will cause a Nullpointer Exception. Set it before creating an IndexWriter."); + return getIndexWriter(strIndexPathOrURL, getDefaultAnalyzer()); + } + + + + /** + * Gets a writer instance for an index. DON'T !!!!! close your writer afterwards - use the >>>>> releaseWriter4DefaultIndex() <<<<< method instead, and make SHURE not + * to forget this. The close will be done automatically, and you would permit any other threads to work with the index by doing this
+ * In the case the specified directory does not exist or is empty, an empty index will be created.
+ * Remark:
+ * You can change the timeout Lucene waits for getting write access by setting IndexWriter.WRITE_LOCK_TIMEOUT.
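+ * Usage sketch (StandardAnalyzer is only an illustrative choice; release the writer as with the other overload):
+ *   IndexWriter writer = IndexAccessor.getIndexWriter("/tmp/myIndex", new StandardAnalyzer());
+ *   ... add or delete documents ...
+ *   IndexAccessor.releaseIndexWriter(writer);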
    + * It is in almost any case I can imagine no good idea to have an IndexWriter member variable that refers on the reference from this method. This will block all other + * processes that wants to get access to the index. You can make this in a short-living Object, but know exactly what yo do... + * + * @param strIndexPathOrURL the path to the index + * @param analyzer the Lucene analyzer that should be used for this writer creation + * + * @return a writer instance for the given index. Autocommit will be FALSE. + * + * @throws CorruptIndexException + * @throws LockObtainFailedException + * @throws IOException + */ + @SuppressWarnings("deprecation") + synchronized static public IndexWriter getIndexWriter(String strIndexPathOrURL, Analyzer analyzer) throws CorruptIndexException, LockObtainFailedException, + IOException + { + + // Haben wir schon einen geöffneten Writer? + IndexWriter ourIndexWriter = m_hsIndexPathOrURL2Writer.get(strIndexPathOrURL); + + + // wenn nicht, machen wir doch einen neuen + if(ourIndexWriter == null) + { + // wenn es ein leeres directory ist oder es nicht existiert, dann machen wir auch gleich einen neuen Index + createNewIndex(strIndexPathOrURL, false); + + FSDirectory dir = createFSDirectory(new File(strIndexPathOrURL)); + + logger.fine("will open indexWriter for '" + strIndexPathOrURL + "'"); + + ourIndexWriter = new IndexWriter(dir, new IndexWriterConfig( analyzer).setOpenMode(OpenMode.APPEND)); + + m_hsIndexPathOrURL2Writer.put(strIndexPathOrURL, ourIndexWriter); + } + + // wir verwalten Tokens - diese müssen wieder mit releaseWriter freigegeben werden + Integer iOld = m_hsIndexWriter2WriterRefCount.get(ourIndexWriter); + if(iOld == null) + m_hsIndexWriter2WriterRefCount.put(ourIndexWriter, 1); + else + m_hsIndexWriter2WriterRefCount.put(ourIndexWriter, ++iOld); + + if(logger.isLoggable(Level.FINEST)) logger.finest("get indexWriter for '" + strIndexPathOrURL + "'\n" + LoggingUtils.getCurrentStackTrace()); + + return ourIndexWriter; + } + + + + /** + * Gets all index paths that are currently inside the writer cache + * + * @return all index paths that are currently inside the writer cache + */ + public static Set getIndexWriterPaths() + { + return m_hsIndexPathOrURL2Writer.keySet(); + } + + + + /** + * This is an expert method - the use of RemoteIndexReader is recommended. Gets the reader for the given index path. The reader will be created when necessary. In the + * case the specified directory does not exists or is empty, an empty index will be created, if you want.
+ * Remark:
+ * Note that refreshing a reader is a relatively expensive operation. The reader Object returned from this method does not necessarily reflect the current state of the index. To
+ * get a guaranteed up-to-date, refreshed reader object, you have the following possibilities:
+ *
+ * - invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()}
+ * - set a refresh interval with {@link #setReaderRefreshIntervall(long)}, so that all reader Objects returned by {@link #getLuceneIndexReader(String, boolean)} are refreshed periodically
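+ * Usage sketch (the index path is an illustrative placeholder):
+ *   IndexReader reader = IndexAccessor.getLuceneIndexReader("/tmp/myIndex", false);
+ *   try { int numDocs = reader.numDocs(); }
+ *   finally { IndexAccessor.releaseLuceneIndexReader(reader); }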
    + * Don't forget to release your reader Object with {@link #releaseLuceneIndexReader(IndexReader)} + * + * @param strIndexPathOrURL the path to the index you wants to read from. This can be a simple path 'e.g. /home/hitzliputzli' or with URI Syntax + * ('file:\\/home/hitzliputzli'). + * @param bCreateIndexIfNotExist if true, the index will be created in the case he did not exist + * + * @return the index reader object + * + * @throws CorruptIndexException + * @throws IOException + * @throws URISyntaxException + */ + synchronized static public IndexReader getLuceneIndexReader(String strIndexPathOrURL, boolean bCreateIndexIfNotExist) throws CorruptIndexException, IOException, + URISyntaxException + { + IndexReader reader = m_hsIndexPathOrId2CurrentIndexReader.get(strIndexPathOrURL); + + // wenn wir noch keinen haben, dann erstellen wir uns einen + if(reader == null) + { + + logger.fine("will create new reader for index '" + strIndexPathOrURL + "'"); + + + File fIndex = null; + // die super-URI-Implementierung nimmt echt alles an, was auch keine Uri ist, ohne eine syntaxException - insbesondere einen Pfad :( + + if(strIndexPathOrURL.startsWith("file:")) + fIndex = new File(new URI(strIndexPathOrURL)); + else + fIndex = new File(strIndexPathOrURL); + + + + // wenn es ein leeres directory ist oder es nicht existiert, dann machen wir auch gleich einen neuen Index + if(bCreateIndexIfNotExist) createNewIndex(strIndexPathOrURL, false); + + Directory dir = createFSDirectory(fIndex); + + + reader = DirectoryReader.open(dir); + + + // hier steht immer der neueste drin - die alten werden in der release-methode wieder zu gemacht + m_hsIndexPathOrId2CurrentIndexReader.put(strIndexPathOrURL, reader); + } + + + // das Token wird für diesen Index inkrementiert + Integer iOld = m_hsIndexReader2ReaderRefCount.get(reader); + if(iOld == null) + { + m_hsIndexReader2ReaderRefCount.put(reader, 1); + m_hsIndexReader2IndexPath.put(reader, strIndexPathOrURL); + } + else + m_hsIndexReader2ReaderRefCount.put(reader, ++iOld); + + + if(logger.isLoggable(Level.FINEST)) logger.finest("get reader for index '" + strIndexPathOrURL + "'\n" + LoggingUtils.getCurrentStackTrace()); + + return reader; + } + + + + synchronized static public IndexSearcher getLuceneIndexSearcher(String strIndexPathOrURL) throws CorruptIndexException, IOException, URISyntaxException + { + logger.fine("will create new searcher for index '" + strIndexPathOrURL + "'"); + + IndexSearcher searcher4Index = new IndexSearcher(getLuceneIndexReader(strIndexPathOrURL, false)); + + + + return searcher4Index; + } + + + + synchronized static public IndexSearcher getLuceneMultiSearcher(LinkedHashSet sIndexPathsOrURLs) throws CorruptIndexException, IOException, + URISyntaxException + { + logger.fine("will create new searcher for index '" + sIndexPathsOrURLs + "'"); + + IndexSearcher searcher4Index = new IndexSearcher(getLuceneMultiReader(sIndexPathsOrURLs, false)); + + + + return searcher4Index; + } + + + + /** + * Gets the lucene MultiReader for all given LOCAL reader paths (paths that point to the file system, not to a remote index). The readers will be created when + * necessary. In the case a specified directory does not exist or is empty, an empty index will be created, if you want.
+ * Remark:
+ * Note that refreshing a reader is a relatively expensive operation. The reader Object returned from this method does not necessarily reflect the current state of the index. To
+ * get a guaranteed up-to-date, refreshed reader object, you have the following possibilities:
+ *
+ * - invoke one of the methods {@link #refreshIndexReader(String)} or {@link #refreshAllIndexReaders()}
+ * - set a refresh interval with {@link #setReaderRefreshIntervall(long)}, so that all reader Objects returned by {@link #getLuceneIndexReader(String, boolean)} are refreshed periodically
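+ * Usage sketch (two illustrative local index paths; the LinkedHashSet keeps the given order):
+ *   LinkedHashSet<String> indexPaths = new LinkedHashSet<>();
+ *   indexPaths.add("/tmp/indexA");
+ *   indexPaths.add("/tmp/indexB");
+ *   MultiReader multiReader = IndexAccessor.getLuceneMultiReader(indexPaths, false);
+ *   IndexSearcher searcher = new IndexSearcher(multiReader);
+ *   ... search ...
+ *   IndexAccessor.releaseLuceneIndexReader(multiReader); // releases the wrapped sub-readers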
    + * You dont have to release a RemoteIndexReader. + * + * @param sIndexPathsOrURLs the paths to the indices you want to read from. This can be a simple path 'e.g. /home/hitzliputzli' or with URI Syntax + * ('file:\\/home/hitzliputzli'). In the case the specified protocoll is not of type 'file', + * @param bCreateIndexIfNotExist if true, the index will be created in the case he did not exist + * + * @return the index reader object + * + * @throws CorruptIndexException + * @throws IOException + * @throws URISyntaxException + */ + synchronized static public MultiReader getLuceneMultiReader(LinkedHashSet sIndexPathsOrURLs, boolean bCreateIndexIfNotExist) throws CorruptIndexException, + IOException, URISyntaxException + { + + + LinkedList lReaders = new LinkedList<>(); + for (String strIndexPathOrUrl : sIndexPathsOrURLs) + { + + if(isLocalPath(strIndexPathOrUrl)) + { + // lokal - wir rufen einfach die entsprechene LuceneReader-Methode einmal auf, um das Objekt intern zu erstellen + IndexReader luceneIndexReader = getLuceneIndexReader(strIndexPathOrUrl, bCreateIndexIfNotExist); + + + lReaders.add(luceneIndexReader); + } + else + { + // ignore + } + + } + + + BetterMultiReader multiReader = new BetterMultiReader(lReaders.toArray(new IndexReader[0]), false); + + + return multiReader; + } + + + // + // /** + // * Gets a MultiReader that wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader. + // * + // * @param indexPathsOrIDs2CreateIfNotExist the set of indices that should be wrapped by the MultiReader. The last reader in the list will be stable with respect to + // * write modifications during the livetime of this MultiReader, because the documents index number will stay stable in this index. For each index, you can + // * specify whether she should be created or not in the case it not exists. + // * + // * @return a MultiReader the wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader. + // * + // * @throws CorruptIndexException + // * @throws IOException + // */ + // public synchronized static RemoteIndexReader getMultiIndexReader(LinkedHashMap indexPathsOrIDs2CreateIfNotExist) throws CorruptIndexException, + // IOException + // { + // + // // wir trennen die lokalen von den remote-URLs. 
Mit den lokalen machen wir EINEN LuceneMultiReader, und dann packen wir die remotes dazu + // + // // Wir trennen in remote-und lokale Indizes + // LinkedList lLocalIndices = new LinkedList<>(); + // LinkedList lRemoteIndices = new LinkedList<>(); + // + // for (Entry strIndexPathOrURL2CreateIfNotExist : indexPathsOrIDs2CreateIfNotExist.entrySet()) + // { + // + // String strIndexPathOrURL = strIndexPathOrURL2CreateIfNotExist.getKey(); + // Boolean bCreateIfNotExist = strIndexPathOrURL2CreateIfNotExist.getValue(); + // + // if(isLocalPath(strIndexPathOrURL)) + // { + // lLocalIndices.add(strIndexPathOrURL); + // if(bCreateIfNotExist) createNewIndex(strIndexPathOrURL, false); + // } + // else + // { + // lRemoteIndices.add(strIndexPathOrURL); + // } + // } + // + // + // LinkedList llReaderz = new LinkedList(); + // + // // der lokale MultiReader + // de.dfki.inquisition.lucene.RemoteIndexReader localReader = new RemoteIndexReaderImpl(lLocalIndices.toArray(new String[0])); + // localReader.setIdAttributename(m_strIdAttributeName); + // llReaderz.add(localReader); + // + // + // // die remote reader + // for (String strRemoteURL : lRemoteIndices) + // { + // + // try + // { + // // index creation is of no sense when we have a remote reader anyway + // de.dfki.inquisition.lucene.RemoteIndexReader reader = getIndexReader(strRemoteURL, false); + // // check if this reader is available + // reader.numDocs(); + // + // llReaderz.add(reader); + // } + // catch (Exception e) + // { + // logger.log(Level.SEVERE, "Exception while creating a remote index reader. The index '" + strRemoteURL + "' will be ignored. ('" + e.getMessage() + "')"); + // logger.log(Level.FINE, "Exception for index '" + strRemoteURL + "': ", e); + // } + // } + // + // + // // und daraus erzeugen wir uns jetzt nen MultiReader + // if(llReaderz.size() == 1) return llReaderz.get(0); + // + // RemoteMultiIndexReader multiReader = new RemoteMultiIndexReader(llReaderz.toArray(new de.dfki.inquisition.lucene.RemoteIndexReader[0])); + // + // + // return multiReader; + // } + + + + // /** + // * Gets a MultiReader that wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader. + // * + // * @param indexPathsOrIDs the set of indices that should be wrapped by the MultiReader. The last reader in the list will be stable with respect to write modifications + // * during the livetime of this MultiReader, because the documents index number will stay stable in this index. For each index, the index will NOT created + // * in the case it does not exists + // * + // * @return a MultiReader the wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader. + // * + // * @throws CorruptIndexException + // * @throws IOException + // */ + // public synchronized static RemoteIndexReader getMultiIndexReader(LinkedHashSet indexPathsOrIDs) throws CorruptIndexException, IOException + // { + // return getMultiIndexReader(indexPathsOrIDs, false); + // } + + // + // + // /** + // * Gets a MultiReader that wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader. + // * + // * @param indexPathsOrIDs the set of indices that should be wrapped by the MultiReader. The last reader in the list will be stable with respect to write modifications + // * during the livetime of this MultiReader, because the documents index number will stay stable in this index. 
For each index, the index will NOT created + // * in the case it does not exists (beside the last one if you want it) + // * @param bCreateLastIndexInListIfNotExist if true, the last index in the list will be created in the case it does not exist + // * + // * @return a MultiReader the wrapps all index readers for the given Set of index paths. You dont have to release a RemoteIndexReader. + // * + // * @throws CorruptIndexException + // * @throws IOException + // */ + // public synchronized static RemoteIndexReader getMultiIndexReader(LinkedHashSet indexPathsOrIDs, boolean bCreateLastIndexInListIfNotExist) + // throws CorruptIndexException, IOException + // { + // LinkedHashMap hsIndexPathsOrIDs2CreateIfNotExist = new LinkedHashMap(); + // + // + // int i = 0; + // for (String strIndexPathOrURL : indexPathsOrIDs) + // { + // boolean bCreateIfNotExist = false; + // if(i == indexPathsOrIDs.size() - 1) bCreateIfNotExist = bCreateLastIndexInListIfNotExist; + // + // hsIndexPathsOrIDs2CreateIfNotExist.put(strIndexPathOrURL, bCreateIfNotExist); + // + // i++; + // } + // + // return getMultiIndexReader(hsIndexPathsOrIDs2CreateIfNotExist); + // } + + + // + // /** + // * Gets a MultiReader that wrapps all currently cached index readers. You dont have to release a RemoteIndexReader. + // * + // * @param strLastIndexInListPathOrID this will be the last reader in the list of reader offered to the MultiReader Constructor. In this index you can write and read + // * in parallel, because the document numbers will not change during writing (until index optimization). In the case you don't write to any index, the order + // * is irrelevant and you can set this paraeter simply null + // * + // * @return a MultiReader that wrapps all currently cached index readers. You dont have to release a RemoteIndexReader. + // * + // * @throws CorruptIndexException + // * @throws IOException + // */ + // public synchronized static RemoteMultiIndexReader getMultiIndexReader(String strLastIndexInListPathOrID) throws CorruptIndexException, IOException + // { + // return getMultiIndexReader(strLastIndexInListPathOrID, false); + // } + + + + // /** + // * Gets a MultiReader that wrapps all currently cached index readers. Don't forget to release it with {@link #releaseLuceneIndexReader(IndexReader)} + // * + // * @param strLastIndexInListPathOrID this will be the last reader in the list of reader offered to the MultiReader Constructor. In this index you can write and read + // * in parallel, because the document numbers will not change during writing (until index optimization). In the case you don't write to any index, the order + // * is irrelevant and you can set this paraeter simply null + // * @param bCreateLastIndexInListIfNotExist if true, the last index in the list will be created in the case it does not exist + // * + // * @return a MultiReader that wrapps all currently cached index readers. You dont have to release a RemoteIndexReader. 
+ // * + // * @throws CorruptIndexException + // * @throws IOException + // */ + // public synchronized static RemoteMultiIndexReader getMultiIndexReader(String strLastIndexInListPathOrID, boolean bCreateLastIndexInListIfNotExist) + // throws CorruptIndexException, IOException + // { + // LinkedList llReaderz = new LinkedList(); + // + // + // // der reader, auf den auch schreibend zugegriffen werden kann, machen wir am Schluß rein - ich habe die Hoffnung, + // // daß sich dann nicht die docIDs verschieben, wenn gleichzeitig geschrieben und in diesem und in externen Indices + // // gesucht wird...die externen müssen halt readonly sein...und des funzt auch :) + // + // + // HashSet hsIndexPaths = new HashSet(); + // hsIndexPaths.addAll(getIndexReaderPathsAndIDs()); + // + // // aaalso. wir erstellen alle Readers, und für den letzten wird das Flag eingesetzt... + // for (String strIndexPathOrURL : hsIndexPaths) + // { + // + // boolean bIsLast = strIndexPathOrURL.equals(strLastIndexInListPathOrID); + // + // try + // { + // + // de.dfki.inquisition.lucene.RemoteIndexReader reader; + // if(bIsLast) + // reader = getIndexReader(strIndexPathOrURL, bCreateLastIndexInListIfNotExist); + // else + // reader = getIndexReader(strIndexPathOrURL, false); + // + // + // if(strLastIndexInListPathOrID == null || llReaderz.size() == 0 || bIsLast) + // llReaderz.addLast(reader); + // else + // llReaderz.addFirst(reader); + // + // } + // catch (Exception e) + // { + // logger.log(Level.SEVERE, "Exception while creating a MultiReader. The index '" + strIndexPathOrURL + "' will be ignored. ('" + e.getMessage() + "')"); + // logger.log(Level.FINE, "Exception for index '" + strIndexPathOrURL + "': ", e); + // } + // } + // + // + // // und daraus erzeugen wir uns jetzt nen MultiReader + // RemoteMultiIndexReader multiReader = new RemoteMultiIndexReader(llReaderz.toArray(new RemoteIndexReader[0])); + // + // + // return multiReader; + // } + + + + // synchronized static public RemoteIndexSearcher getMultiIndexSearcher(LinkedHashSet indexPathsOrURLs) throws IOException, URISyntaxException + // { + // + // // - wir erzeugen uns einen searcher aus jeder Quelle - ganz einfach mit getIndexSearcher. Da wird dann auch die Unterscheidung zwischen + // // lokal- und remoteSearcher gemacht. 
+ // // - wir nehmen den wunderschönen ParallelMultiSearcher - verteilte document frequency + multithreaded Suche....sehr schön :)...den gibts nicht mehr :( + // + // + // + // // Wir trennen in remote-und lokale Indizes + // LinkedList lLocalIndices = new LinkedList<>(); + // LinkedList lRemoteIndices = new LinkedList<>(); + // + // for (String strIndexPathOrURL : indexPathsOrURLs) + // { + // if(isLocalPath(strIndexPathOrURL)) + // { + // lLocalIndices.add(strIndexPathOrURL); + // } + // else + // { + // lRemoteIndices.add(strIndexPathOrURL); + // } + // } + // + // + // LinkedList llSearcherz = new LinkedList(); + // + // // der lokale MultiSearcher + // RemoteIndexSearcherImpl localSearcher = new RemoteIndexSearcherImpl(lLocalIndices.toArray(new String[0])); + // localSearcher.setIdAttributename(m_strIdAttributeName); + // llSearcherz.add(localSearcher); + // + // + // // die remote reader + // for (String strRemoteURL : lRemoteIndices) + // { + // + // try + // { + // RemoteIndexSearcher searcher = getIndexSearcher(strRemoteURL); + // + // // check if the remote index is up and running + // searcher.maxDoc(); + // + // llSearcherz.add(searcher); + // } + // catch (Exception e) + // { + // logger.log(Level.SEVERE, "Exception while creating a MultiSearcher. The index '" + strRemoteURL + "' will be ignored. ('" + e.getMessage() + "')"); + // logger.log(Level.FINE, "Exception for index '" + strRemoteURL + "': ", e); + // } + // } + // + // + // // und daraus erzeugen wir uns jetzt nen MultiSearcer + // if(llSearcherz.size() == 1) return llSearcherz.get(0); + // + // RemoteMultiIndexSearcher multiSearcher = new RemoteMultiIndexSearcher(llSearcherz.toArray(new RemoteIndexSearcher[0])); + // + // + // return multiSearcher; + // + // + // + // // + // // + // // + // // + // // LinkedList llSearchables = new LinkedList(); + // // + // // for (String strIndexPathOrURL : indexPathsOrURLs) + // // { + // // try + // // { + // // + // // RemoteIndexSearcher searcher = getIndexSearcher(strIndexPathOrURL); + // // llSearchables.add(searcher); + // // + // // } + // // catch (Exception e) + // // { + // // logger.log(Level.SEVERE, "Exception while creating a MultiSearcher. The index '" + strIndexPathOrURL + "' will be ignored. ('" + e.getMessage() + "')"); + // // logger.log(Level.FINE, "Exception for index '" + strIndexPathOrURL + "': ", e); + // // } + // // } + // // + // // + // // RemoteMultiIndexSearcher searcher = new RemoteMultiIndexSearcher(llSearchables.toArray(new RemoteIndexSearcher[0])); + // // + // // + // // return searcher; + // } + + + // + // synchronized static public RemoteIndexSearcher getMultiIndexSearcher(String strLastIndexInListPathOrID) throws IOException, URISyntaxException + // { + // + // LinkedList llIndices = new LinkedList(); + // + // + // // der reader, auf den auch schreibend zugegriffen werden kann, machen wir am Schluß rein - ich habe die Hoffnung, + // // daß sich dann nicht die docIDs verschieben, wenn gleichzeitig geschrieben und in diesem und in externen Indices + // // gesucht wird...die externen müssen halt readonly sein...und des funzt auch :) + // + // + // HashSet hsIndexPaths = new HashSet(); + // hsIndexPaths.addAll(getIndexReaderPathsAndIDs()); + // + // // aaalso. wir erstellen alle Readers, und für den letzten wird das Flag eingesetzt... 
+ // for (String strIndexPathOrURL : hsIndexPaths) + // { + // + // boolean bIsLast = strIndexPathOrURL.equals(strLastIndexInListPathOrID); + // + // if(strLastIndexInListPathOrID == null || llIndices.size() == 0 || bIsLast) + // llIndices.addLast(strIndexPathOrURL); + // else + // llIndices.addFirst(strIndexPathOrURL); + // } + // + // + // return getMultiIndexSearcher(new LinkedHashSet(llIndices)); + // } + + + + + + /** + * Gets the time intervall all reader objects will be refreshed automatically. After a refresh, all Objects from subsequent calls of {@link #getLuceneIndexReader(String, boolean)} + * will reflect the current state of an index, with any changes done. + * + * @return the reader refresh time intervall + */ + static public long getReaderRefreshIntervall() + { + return m_lReaderRefreshIntervall; + } + + + + // /** + // * Gets whether native file locking is enabled or not + // * + // * @return whether native file locking is enabled or not + // */ + // static public boolean isNativeFileLockEnabled() + // { + // return m_bNativeFileLock; + // } + + + + + /** + * Returns true in the case a reader object for a given index path is inside the cache + * + * @param strIndexPathOrURL the index path for the reader object + * + * @return true in the case a reader object for the given index path is inside the cache + */ + static public boolean isReaderInCache(String strIndexPathOrURL) + { + return m_hsIndexPathOrId2CurrentIndexReader.containsKey(strIndexPathOrURL); + } + + + + /** + * Refreshs all index readers + * + * @throws CorruptIndexException + * @throws IOException + * @throws URISyntaxException + */ + synchronized static public void refreshAllIndexReaders() throws CorruptIndexException, IOException, URISyntaxException + { + LinkedList llKeys = new LinkedList(); + llKeys.addAll(m_hsIndexPathOrId2CurrentIndexReader.keySet()); + + for (String strIndexPathOrURL : llKeys) + refreshIndexReader(strIndexPathOrURL); + + } + + + + + + + + /** + * Refreshs an index reader for a given path. In the case the indexReader was not formerly created by {@link #getLuceneIndexReader(String, boolean)}, it will be + * created. In the case you will pass the ID of a static Reader, the method will do nothing. + * + * @param strIndexPath the path to the lucene index + * + * @throws CorruptIndexException + * @throws IOException + * @throws URISyntaxException + */ + synchronized static public void refreshIndexReader(String strIndexPath) throws CorruptIndexException, IOException, URISyntaxException + { + refreshIndexReader(strIndexPath, false); + } + + + + // static public boolean isLocalPath(String strIndexPathOrURL) + // { + // try + // { + // + // if(strIndexPathOrURL == null) return false; + // + // File fIndex = null; + // // die super-URI-Implementierung nimmt echt alles an, was auch keine Uri ist, ohne eine syntaxException - insbesondere einen Pfad :( + // + // if(strIndexPathOrURL.startsWith("file:")) + // + // fIndex = new File(new URI(strIndexPathOrURL)); + // else + // fIndex = new File(strIndexPathOrURL); + // + // + // if(fIndex.exists()) return true; + // + // return false; + // + // + // } + // catch (URISyntaxException e) + // { + // return false; + // } + // + // } + + + + /** + * Refreshs an index reader for a given path. In the case the indexReader was not formerly created by {@link #getLuceneIndexReader(String, boolean)}, it will be + * created. In the case the index does not exist, it will be created, if you want. 
In the case you will pass the ID of a static Reader, the method will do nothing. + * + * @param strIndexPath the path to the lucene index + * @param bCreateIndexIfNotExist if true, the index will be created in the case he did not exist + * + * @throws CorruptIndexException + * @throws IOException + * @throws URISyntaxException + */ + synchronized static public void refreshIndexReader(String strIndexPath, boolean bCreateIndexIfNotExist) throws CorruptIndexException, IOException, URISyntaxException + { + + // haben wir schon einen? + IndexReader readerOld = getLuceneIndexReader(strIndexPath, bCreateIndexIfNotExist); + + // wenn es ein statischer Reader ist, dann wird der ned refreshed + if(m_hsStaticIndexReaderSet.contains(readerOld)) return; + // wenn es kein DirectoryReader ist, können wir ihn nicht refreshen + if(!(readerOld instanceof DirectoryReader)) return; + DirectoryReader dirReader = (DirectoryReader) readerOld; + + try + { + if(dirReader.isCurrent()) return; + + logger.info("will refresh reader for index '" + strIndexPath + "'"); + + // den neuen erstellen + // Directory dir = createFSDirectory(new File(strIndexPath)); + // + // if(m_bLoadReadersInMemory) dir = new RAMDirectory(dir); + // + // IndexReader readerNew = IndexReader.open(dir, true); + IndexReader readerNew = DirectoryReader.openIfChanged(dirReader); + + + // hier steht immer der neueste drin - die alten werden in der release-methode wieder zu gemacht + m_hsIndexPathOrId2CurrentIndexReader.put(strIndexPath, readerNew); + + } + catch (org.apache.lucene.store.AlreadyClosedException e) + { + logger.warning("reader for '" + strIndexPath + "' was closed at refresh time"); + } + finally + { + // der alte Reader wird dann geschlossen, wenn er nicht mehr gebraucht wird + releaseLuceneIndexReader(readerOld); + } + + } + + + + + + /** + * Release your indexWriter that you get with getIndexWriter - in any case. In the case the IndexWriter is no more needed by some Instance, it will be commited and + * closed. 
+ * + * @param indexWriter the writer Object that should be released + */ + synchronized static public void releaseIndexWriter(IndexWriter indexWriter) + { + try + { + // wir dekrementieren den count für den aktuellen Index + Integer iOld = m_hsIndexWriter2WriterRefCount.get(indexWriter); + if(iOld == null || iOld == 0) + { + logger.warning("have no writer index token for '" + indexWriter + "'"); + return; + } + + // das müssen wir an dieser Stelle machen - wenn der writer geclosed ist, dann wirft getDirectory eine Exception + if(!(indexWriter.getDirectory() instanceof FSDirectory)) throw new IllegalStateException("Directory is not of type FSDirectory"); + + String strIndexPathOrURL = ((FSDirectory) indexWriter.getDirectory()).getDirectory().toAbsolutePath().toString(); + + + int iNew = --iOld; + + String strDontCloseIndexWriters = System.getProperty("de.dfki.inquisition.lucene.IndexAccessor.DontCloseIndexWriters"); + boolean bIgnoreClose = false; + if(strDontCloseIndexWriters != null) bIgnoreClose = Boolean.parseBoolean(strDontCloseIndexWriters); + + if(iNew == 0 && !bIgnoreClose) + { + // wenn wir bei 0 sind, dann mache mer des Ding gleich zu + Set> entrySet = m_hsIndexPathOrURL2Writer.entrySet(); + Iterator> itEntries = entrySet.iterator(); + while (itEntries.hasNext()) + { + Entry entry = itEntries.next(); + if(entry.getValue().equals(indexWriter)) itEntries.remove(); + } + + + m_hsIndexWriter2WriterRefCount.remove(indexWriter); + + + logger.fine("will close indexWriter for '" + strIndexPathOrURL + "'"); + + indexWriter.commit(); + if(isLocalPath(strIndexPathOrURL)) indexWriter.close(); + } + else + m_hsIndexWriter2WriterRefCount.put(indexWriter, iNew); + + if(logger.isLoggable(Level.FINEST)) + { + if(bIgnoreClose) + logger.finest("indexWriter '" + strIndexPathOrURL + "' released - closing IGNORED (writer is still open)\n" + LoggingUtils.getCurrentStackTrace()); + else + logger.finest("indexWriter '" + strIndexPathOrURL + "' released\n" + LoggingUtils.getCurrentStackTrace()); + } + + } catch (IOException e) + { + logger.severe(ExceptionUtils.createStackTraceString(e)); + } + } + + + + /** + * This is an expert method - the use of RemoteIndexReader is recommended (You don't need to release it). Releases your reader Object in the case you don't need it + * anymore. In the case every instance has released a specific index path, the reader object will be closed. + * + * @param reader the IndexReader Object you gets formerly with IndexAccessor + */ + synchronized static public void releaseLuceneIndexReader(IndexReader reader) + { + + try + { + + if(reader instanceof BetterMultiReader) + { + for (IndexReader subReader : ((BetterMultiReader) reader).getSubReaders()) + releaseLuceneIndexReader(subReader); + + return; + } + + + String strIndexPathOrURL4Reader = m_hsIndexReader2IndexPath.get(reader); + if(strIndexPathOrURL4Reader == null) + logger.severe("have no path entry for reader. This is a hint to an error, e.g. 
you have released the reader too often, or the reader was not created with IndexAccessor."); + + + Integer iOldRefCount = m_hsIndexReader2ReaderRefCount.get(reader); + + if(iOldRefCount == null || iOldRefCount == 0) + { + logger.warning("have no reader index token for '" + strIndexPathOrURL4Reader + "'"); + return; + } + + int iNew = --iOldRefCount; + + if(iNew == 0) + { + // wenn wir bei 0 sind, dann mache mer des Ding gleich zu - wenn es nicht noch im Cache bleiben soll + m_hsIndexReader2ReaderRefCount.remove(reader); + m_hsIndexReader2IndexPath.remove(reader); + + // wir schliessen den nur, wenn es nicht der aktuelle aus der hashmap ist - ansonsten müssten wir ihn ständig wieder neu erzeugen. + // der aktuelle wir dann geschlossen, wenn es einen neueren gibt oder explizit mit removeReaderFromCache + + // wenn vorher gesagt wurde (mit removeReaderFromCacheWhenPossible), daß des Teil geschlossen werden soll, machen wir es auch zu + + if(!m_hsIndexPathOrId2CurrentIndexReader.containsValue(reader)) + { + // es ist nicht der aktuelle reader + if(isLocalPath(strIndexPathOrURL4Reader)) + { + logger.info("will close indexReader '" + strIndexPathOrURL4Reader + "'"); + reader.close(); + } + + } + else if(m_hsReader2Remove.contains(reader)) removeReaderFromCache(strIndexPathOrURL4Reader); + + } + else + m_hsIndexReader2ReaderRefCount.put(reader, iNew); + + + if(logger.isLoggable(Level.FINEST)) logger.finest("indexReader '" + strIndexPathOrURL4Reader + "' released\n" + LoggingUtils.getCurrentStackTrace()); + + + } + catch (IOException e) + { + logger.severe(ExceptionUtils.createStackTraceString(e)); + } + } + + + + synchronized static public void releaseLuceneIndexSearcher(IndexSearcher searcher) + { + releaseLuceneIndexReader(searcher.getIndexReader()); + } + + + + /** + * Removes an closes the reader object for a given index path from the cache. This is only possible in the case this object is no more in use - the method will throw + * an exception otherwise. + * + * @param strIndexPathOrURL the path to the index + * + * @throws IOException + */ + synchronized static public void removeReaderFromCache(String strIndexPathOrURL) throws IOException + { + // wir haben immer den aktuellen reader für einen index im Speicher - hier können wir ihn wieder entfernen, um den Speicher freizugeben + + // wenn der alte Reader nicht mehr benötigt wird, dann wird er geschlossen + IndexReader reader = m_hsIndexPathOrId2CurrentIndexReader.get(strIndexPathOrURL); + + if(m_hsIndexReader2ReaderRefCount.get(reader) == null) + { + logger.fine("will close indexReader '" + strIndexPathOrURL + "'"); + m_hsIndexPathOrId2CurrentIndexReader.remove(strIndexPathOrURL); + m_hsStaticIndexReaderSet.remove(reader); + + if(isLocalPath(m_hsIndexReader2IndexPath.get(reader))) reader.close(); + + m_hsReader2Remove.remove(reader); + } + else + { + throw new IllegalStateException("Cannot remove reader object for '" + strIndexPathOrURL + + "' from cache. It is still in use. Did you forget an releaseIndexReader(..) invocation?"); + + } + } + + + + /** + * Removes an closes the reader object for a given index path from the cache. This is only possible in the case this object is no more in use - otherwise, the reader + * Object will be removed from the cache immediately when it is no more in use. + * + * @param strIndexPathOrURL the path to the index + * + * @return READER_CLOSED in the case the reader was closed immediately, READER_IN_QUEUE if it is in the queue of 'to close readers' now. 
If the reader is not inside + * the cache, the method will return READER_NOT_IN_CACHE + * + * @throws IOException + */ + synchronized static public ReaderStatus removeReaderFromCacheWhenPossible(String strIndexPathOrURL) throws IOException + { + // wir haben immer den aktuellen reader für einen index im Speicher - hier können wir ihn wieder entfernen, um den Speicher freizugeben + + if(!isReaderInCache(strIndexPathOrURL)) return ReaderStatus.READER_NOT_IN_CACHE; + + // wenn der alte Reader nicht mehr benötigt wird, dann wird er geschlossen + IndexReader reader = m_hsIndexPathOrId2CurrentIndexReader.get(strIndexPathOrURL); + + if(m_hsIndexReader2ReaderRefCount.get(reader) == null) + { + logger.fine("will close indexReader '" + strIndexPathOrURL + "'"); + m_hsIndexPathOrId2CurrentIndexReader.remove(strIndexPathOrURL); + m_hsStaticIndexReaderSet.remove(reader); + + if(isLocalPath(m_hsIndexReader2IndexPath.get(reader))) reader.close(); + + return ReaderStatus.READER_CLOSED; + + } + else + { + m_hsReader2Remove.add(reader); + + return ReaderStatus.READER_IN_QUEUE; + } + } + + + + + // /** + // * Simply removes a formerly cached Searcher Object from the cache. Only remote Searcher proxies are cached - so this is only to give a possibility to free the memory + // * again (nevertheless, there should be not much amount of memory consumtion - in the case you have not thousands of searcher objects, you should be able to ignore + // * this...(hehe - I didn't say that ;) ) + // * + // * @param strIndexPathOrURL the index for which you want to remove the according searcher proxy object out of the internal cache + // */ + // synchronized static public void removeSearcherFromCache(String strIndexPathOrURL) + // { + // m_hsIndexPathOrURL2CurrentRemoteSearcher.remove(strIndexPathOrURL); + // } + + + + /** + * Removes and closes all cached reader objects that are not in use. This method can be used safely at any time, the only disadvantage is that an subsequent + * invocation of {@link #getLuceneIndexReader(String, boolean)} for one of these indices will take longer time. + * + * @throws IOException + */ + static public void removeUnusedReadersFromCache() throws IOException + { + LinkedList llIndexPaths = new LinkedList(); + + llIndexPaths.addAll(m_hsIndexPathOrId2CurrentIndexReader.keySet()); + + for (String strIndexPathOrURL : llIndexPaths) + try + { + removeReaderFromCache(strIndexPathOrURL); + } + catch (IllegalStateException e) + { + if(!e.getMessage().startsWith("Cannot remove reader object for")) throw e; + } + } + + + + /** + * Sets the default analyzer that will be used for writer creation + * + * @param analyzer the default analyzer that will be used for writer creation + */ + static public void setDefaultAnalyzer(Analyzer analyzer) + { + m_analyzer4writer = analyzer; + } + + + + /** + * Sets the default attribute name that will be used for RemotIndexReader creation + * + * @param strIdAttributeName the default attribute name that will be used for RemotIndexReader creation + */ + static public void setDefaultIndexIdAttribute(String strIdAttributeName) + { + IndexAccessor.m_strIdAttributeName = strIdAttributeName; + } + + + + /** + * Sets the time intervall all reader objects will be refreshed automatically. After a refresh, all Objects from subsequent calls of {@link #getLuceneIndexReader(String, boolean)} + * will reflect the current state of an index, with any changes done. 
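+ * Example (sketch): IndexAccessor.setReaderRefreshIntervall(60 * 1000) would let the refresh thread renew the cached readers about once per minute, assuming lMillis is interpreted as milliseconds.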
+ * + * @param lMillis the time intervall the reader should be refreshed + * + * @return the former time intervall + */ + static public long setReaderRefreshIntervall(long lMillis) + { + long lOld = m_lReaderRefreshIntervall; + + m_lReaderRefreshIntervall = lMillis; + + return lOld; + } + + + + protected static FSDirectory createFSDirectory(File fDirPath) throws IOException + { + // das muß man so umständlich mit setLockfactory machen - wenn man einfach initial das die erstellt, und das dir wurde mit einer anderen + // LockFactory erstellt, dann kommt ne Exception + + + // null heißt SimpleFileLock (ich hab gekuckt ;) ) + FSDirectory dir = FSDirectory.open(fDirPath.toPath()); + + // NativeFSLockFactory lockFactory = new NativeFSLockFactory(fDirPath); + // lockFactory.setLockPrefix("indexAccessor"); + // if(isNativeFileLockEnabled()) dir.setLockFactory(lockFactory); + + return dir; + } + + + + + /** + * Closes all reader and writer objects. This is mainly for the shutdown hook, to make shure that no other processes will be blocked by non-closed Objects + * + * @throws IOException + */ + protected static void forceCloseAll() throws IOException + { + if(m_hsIndexReader2ReaderRefCount.size() == 0 && m_hsIndexPathOrURL2Writer.size() == 0) return; + + logger.info("closing of all index readers and writers will be forced " + m_hsIndexReader2ReaderRefCount.size() + " reader(s), " + + m_hsIndexPathOrURL2Writer.size() + " writer(s)"); + + + for (IndexReader reader : m_hsIndexReader2ReaderRefCount.keySet()) + if(isLocalPath(m_hsIndexReader2IndexPath.get(reader))) reader.close(); + + for (Entry pathOrURL2Writer : m_hsIndexPathOrURL2Writer.entrySet()) + { + + String strPath = pathOrURL2Writer.getKey(); + IndexWriter writer = pathOrURL2Writer.getValue(); + writer.commit(); + + if(isLocalPath(strPath)) writer.close(); + } + } + + + + /** + * Gets all reader Objects that should be removed from the cache immediately when they are no more in use + * + * @return all reader Objects that should be removed from the cache immediately when they are no more in use + */ + protected static HashSet getReader2RemoveQueue() + { + return m_hsReader2Remove; + } + + + + + + + /** + * Checks whether the given URL is a local one or not. Local means that the URL starts with 'file:' or that this path exists on the local storage. 
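+ * For example, 'file:///tmp/myIndex' is always treated as local, and '/tmp/myIndex' counts as local if that path exists on disk; other URLs are treated as remote.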
+ */ + protected static boolean isLocalPath(String strIndexPathOrURL) + { + if(StringUtils.nullOrWhitespace(strIndexPathOrURL)) return false; + + File fIndex = null; + // die super-URI-Implementierung nimmt echt alles an, was auch keine Uri ist, ohne eine syntaxException - insbesondere einen Pfad :( + + if(strIndexPathOrURL.startsWith("file:")) return true; + + fIndex = new File(strIndexPathOrURL); + + + if(fIndex.exists()) return true; + + + return false; + } + + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/LuceneAnalyzerFactory.java b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneAnalyzerFactory.java new file mode 100644 index 0000000..ce78cf9 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneAnalyzerFactory.java @@ -0,0 +1,156 @@ +// * Created on 04.11.2005 +package de.dfki.km.leech.lucene.basic; + + + +// import de.dfki.inquisitor.lucene.DynamicFieldType; +// import de.dfki.inquisitor.lucene.FieldConfig; +import de.dfki.inquisitor.text.StringUtils; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; +import org.apache.lucene.util.Version; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.lang.reflect.Constructor; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.logging.Level; +import java.util.logging.Logger; + + + + +public class LuceneAnalyzerFactory +{ + + protected static Logger m_logger = Logger.getLogger(LuceneAnalyzerFactory.class.getName()); + + + + + /** + * Creates a new Analyzer out of the given + * + * @return the according analyzer + * + * @throws Exception + */ + public static Analyzer createAnalyzer(FieldConfig fieldConfig) throws Exception + { + + String strDefaultAnalyzerName = fieldConfig.defaultFieldType.getAnalyzer(); + Analyzer defaultAnalyzer = LuceneAnalyzerFactory.createAnalyzer(strDefaultAnalyzerName, null); + + + HashMap hsFieldName2Analyzer = new HashMap(); + for (Entry fieldname2FieldType : fieldConfig.fieldName2FieldType.entrySet()) + { + String strFieldName = fieldname2FieldType.getKey(); + try + { + String strAnalyzer4Field = fieldname2FieldType.getValue().getAnalyzer(); + if(!StringUtils.nullOrWhitespace(strAnalyzer4Field)) + hsFieldName2Analyzer.put(strFieldName, LuceneAnalyzerFactory.createAnalyzer(strAnalyzer4Field, null)); + } + catch (Exception e) + { + Logger.getLogger(LuceneAnalyzerFactory.class.getName()).warning("could not create analyzer from config of field '" + strFieldName + "'"); + } + } + + return new PerFieldAnalyzerWrapper(defaultAnalyzer, hsFieldName2Analyzer); + } + + + + /** + * Creates a new Analyzer. + * + * @param analyzerClassName The class name of the Analyzer to be created. + * @param userGivenStopWordFileName The file name of the stop word file, or null or empty, if no stop words should be set. 
If the given file name is + * relative + * + * @return the newly created analyzer + * + * @throws Exception + */ + public static Analyzer createAnalyzer(String analyzerClassName, String userGivenStopWordFileName) throws Exception + { + try + { + Analyzer analyzer; + + Class analyzerClass = Class.forName(analyzerClassName); + if(!StringUtils.nullOrWhitespace(userGivenStopWordFileName)) + { + Class[] parameterClasses = { String[].class }; + Constructor constructor; + try + { + constructor = analyzerClass.getConstructor(parameterClasses); + + + m_logger.finer("creating Analyzer " + analyzerClassName + " with stopword file " + userGivenStopWordFileName); + InputStreamReader inReader = new InputStreamReader(new FileInputStream(userGivenStopWordFileName), "UTF-8"); + BufferedReader reader = new BufferedReader(inReader); + ArrayList wordList = new ArrayList(); + String stopWord = reader.readLine(); + while (stopWord != null) + { + wordList.add(stopWord); + stopWord = reader.readLine(); + } + reader.close(); + String[] stopWords = wordList.toArray(new String[wordList.size()]); + + + + Object[] parameters = { stopWords }; + analyzer = (Analyzer) constructor.newInstance(parameters); + } + catch (NoSuchMethodException e) + { + m_logger.warning("Analyzer '" + analyzerClassName + "' cannot be parameterized with stop word list. Specified stop word list will be ignored"); + constructor = analyzerClass.getConstructor(new Class[0]); + Object[] parameters = {}; + analyzer = (Analyzer) constructor.newInstance(parameters); + } + + } + else + { + m_logger.finer("creating Analyzer " + analyzerClassName + " without stopword file"); + + + try + { + //we try if there is a constructor with a single Version parameter + Class[] parameterClasses = { Version.class }; + Constructor constructor = analyzerClass.getConstructor(parameterClasses); + + Object[] parameters = { Version.LUCENE_CURRENT }; + analyzer = (Analyzer) constructor.newInstance(parameters); + } + catch (NoSuchMethodException e) + { + analyzer = (Analyzer) analyzerClass.newInstance(); + } + + + + } + + return analyzer; + + } + catch (Exception e) + { + m_logger.log(Level.WARNING, "Unable to instantiate Analyzer '" + analyzerClassName + "'.", e); + throw new Exception("Unable to instantiate Analyzer '" + analyzerClassName + "'.", e); + } + } + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/LuceneUtilz.java b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneUtilz.java new file mode 100644 index 0000000..612bacc --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/LuceneUtilz.java @@ -0,0 +1,454 @@ +package de.dfki.km.leech.lucene.basic; + + + +import de.dfki.inquisitor.collections.TwoValuesBox; +// import de.dfki.inquisitor.lucene.FieldConfig; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.*; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.*; +import org.apache.lucene.search.MultiTermQuery.RewriteMethod; +import org.apache.lucene.util.Bits; + +import java.io.IOException; +import java.util.*; + + + +public class LuceneUtilz +{ + + + + + /** + * There exists a bug in lucene (at least currently) which yields to the fact that some field attributes are gone if reading a document, which makes re-inserting this + * document to the index impossible. 
As workaround we reinsert all attributes with stored values again to the given document object, with the according fieldType from + * fieldConfig. + * + * @param doc the doc object that should be processed + */ + static public void reInsertStoredFieldTypes(Document doc, FieldConfig fieldConfig) + { + LinkedList llReInsertFields = new LinkedList<>(); + + Iterator itFields = doc.iterator(); + while (itFields.hasNext()) + { + IndexableField oldField = itFields.next(); + + if(!oldField.fieldType().stored()) continue; + + itFields.remove(); + + IndexableField newField; + if(oldField.fieldType().docValuesType() == DocValuesType.NUMERIC) + newField = fieldConfig.createField(oldField.name(), oldField.numericValue()); + else + newField = fieldConfig.createField(oldField.name(), oldField.stringValue()); + + llReInsertFields.add(newField); + } + + for (IndexableField newField : llReInsertFields) + doc.add(newField); + + } + + + + /** + * Extract all the terms in the index matching the query terms. Works also with wildcard queries + * + * @return the terms in the index matching the query terms. Works also with wildcard queries + */ + @SuppressWarnings("javadoc") + static public Set extractQueryTerms(String strQuery, QueryParser queryParser, IndexReader reader) + { + try + { + Query query = queryParser.parse(strQuery); + + + return extractQueryTerms(query, reader); + + } + catch (Exception e) + { + throw new RuntimeException(e); + } + } + + + + /** + * Extract all the terms in the index matching the query terms. Works also with wildcard queries + * + * @return the terms in the index matching the query terms. Works also with wildcard queries + */ + @SuppressWarnings("javadoc") + static public Set extractQueryTerms(Query query, IndexReader reader) + { + try + { + + HashSet subQueries = LuceneUtilz.getSubQueries(query); + List> llQuery2FormerRewrite = new LinkedList<>(); + + for (Query subQuery : subQueries) + { + if(subQuery instanceof MultiTermQuery) + { + llQuery2FormerRewrite.add(new TwoValuesBox((MultiTermQuery) subQuery, ((MultiTermQuery) subQuery).getRewriteMethod())); + // das brauchen wir, damit Lucene wieder die Terme in BooleanQueries reinmultipliziert (prefixQueries, etc.) + ((MultiTermQuery) subQuery).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE); + } + } + + Query rewritten = query.rewrite(reader); + + HashSet hsTerms = new HashSet<>(); + + Weight rewrittenWeight = rewritten.createWeight(new IndexSearcher(reader), false); + rewrittenWeight.extractTerms(hsTerms); + // rewritten.extractTerms(hsTerms); + + // jetzt setzen wir die rewrite Method wieder auf das ursprüngliche zurück + for (TwoValuesBox subQuery2FormerRewrite : llQuery2FormerRewrite) + subQuery2FormerRewrite.getFirst().setRewriteMethod(subQuery2FormerRewrite.getSecond()); + + + return hsTerms; + + } + catch (Exception e) + { + throw new RuntimeException(e); + } + + } + + + + public static List analyzeText(String strFieldName, String strText, Analyzer analyzer, int iMaxResults) + { + try + { + LinkedList llTokenStrings = new LinkedList<>(); + + // wir analysieren/normalisieren den Term für den Lookup + TokenStream tokenstream = analyzer.tokenStream(strFieldName, strText); + + CharTermAttribute termAtt = tokenstream.addAttribute(CharTermAttribute.class); + tokenstream.reset(); // Resets this stream to the beginning. 
(Required) + + for (int i = 0; i < iMaxResults; i++) + { + + if(!tokenstream.incrementToken()) break; + + llTokenStrings.add(termAtt.toString()); + } + + tokenstream.end(); // Perform end-of-stream operations, e.g. set the final offset. + tokenstream.close(); // Release resources associated with this stream. + + + return llTokenStrings; + + } + catch (Exception e) + { + throw new RuntimeException(e); + } + } + + + + static public Bits bits4Doc(final int iDocId, final int iBitsLength) + { + return new Bits() + { + + @Override + public boolean get(int index) + { + if(index == iDocId) + return true; + else + return false; + } + + + + @Override + public int length() + { + return iBitsLength; + } + }; + } + + + + + static public Bits bits4Docs(final Set sDocIds, final int iBitsLength) + { + return new Bits() + { + + @Override + public boolean get(int index) + { + if(sDocIds.contains(index)) + return true; + else + return false; + } + + + + @Override + public int length() + { + return iBitsLength; + } + }; + } + + + + + + + /** + * This method creates a query out of given text for a specific field, with a given analyzer. The method will create a TermQuery in the case the analyzer did not + * tokenized the input text, or a PhraseQuery in the case the analyzer did. All values in the query are fully analyzed an this searchable for the given field with + * respect to the given analyzer. + * + * @return a TermQuery, PhraseQuery or null in the case there was no text left after processing the text with the analyzer + */ + public static Query createQuery(String strFieldName, String strText, Analyzer analyzer) + { + List lAnalyzedText = analyzeText(strFieldName, strText, analyzer, Integer.MAX_VALUE); + + if(lAnalyzedText.size() > 1) + { + PhraseQuery pq = new PhraseQuery(strFieldName, lAnalyzedText.toArray(new String[0])); + // for (String strTerm : lAnalyzedText) + // pq.add(new Term(strFieldName, strTerm)); + + return pq; + } + else if(lAnalyzedText.size() == 1) return new TermQuery(new Term(strFieldName, lAnalyzedText.get(0))); + + return null; + } + + + + public static List getDocsWithTerm(Term term2search, int iMaxResults, IndexSearcher indexSearcher, Set fields2load) + { + + try + { + LinkedList llDocs = new LinkedList<>(); + + TopDocs topDocs = indexSearcher.search(new TermQuery(term2search), iMaxResults); + + for (int i = 0; i < topDocs.scoreDocs.length; i++) + { + + int doc = topDocs.scoreDocs[i].doc; + + if(fields2load == null) + llDocs.add(indexSearcher.doc(doc)); + else + llDocs.add(indexSearcher.doc(doc, fields2load)); + + } + + return llDocs; + + } + catch (IOException e) + { + throw new RuntimeException(e); + } + + } + + + + /** + * Extracts all subqueries which have a boost factor of a given Query into an array + * + * @param query Query to extract subqueries from + * @return an array of the subqueries which have a boost factor + */ + public static Set getSubClauses(Query query) + { + HashSet subqueries = new HashSet(); + + getSubClauses(query, subqueries); + + + return subqueries; + } + + + + private static void getSubClauses(Query query, HashSet subClauses) + { + if(!(query instanceof BooleanQuery)) return; + + BooleanClause[] queryClauses = ((BooleanQuery) query).clauses().toArray(new BooleanClause[0]); + + for (BooleanClause clause : queryClauses) + { + subClauses.add(clause); + + if(clause.getQuery() instanceof BooleanQuery) getSubClauses(clause.getQuery(), subClauses); + } + } + + + + /** + * Extracts all subqueries of a given Query. 
The given query will also be part of the returned set. + * + * @param query Query to extract subqueries from + * + * @return all subqueries + */ + public static HashSet getSubQueries(Query query) + { + HashSet subqueries = new HashSet(); + getSubQueries(query, subqueries); + + return subqueries; + } + + + + protected static void getSubQueries(Query query, HashSet subQueries) + { + if(query instanceof BooleanQuery) + { + BooleanClause[] queryClauses = ((BooleanQuery) query).clauses().toArray(new BooleanClause[0]); + + for (int i = 0; i < queryClauses.length; i++) + getSubQueries(queryClauses[i].getQuery(), subQueries); + } + + subQueries.add(query); + } + + + // + // static public int getTermFrq4Doc(Term term, int iDocId, IndexReader reader) + // { + // return getTermFrq4Docs(term, bits4Doc(iDocId, reader.maxDoc()), reader); + // } + // + // + // + // static public int getTermFrq4Docs(Term term, Bits docBits, IndexReader reader) + // { + // + // try + // { + // DocsEnum docEnum = MultiFields.getTermDocsEnum(reader, docBits, term.field(), term.bytes()); + // int termFreq = 0; + // + // @SuppressWarnings("unused") + // int doc = DocsEnum.NO_MORE_DOCS; + // while ((doc = docEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) + // { + // termFreq += docEnum.freq(); + // } + // + // + // return termFreq; + // + // } + // catch (Exception e) + // { + // throw new RuntimeException(e); + // } + // } + // + // + // + // + // + // static public int getTermFrq4Docs(Term term, Set sDocIds, IndexReader reader) + // { + // return getTermFrq4Docs(term, bits4Docs(sDocIds, reader.maxDoc()), reader); + // } + // + // + // + // + // static public int getTermFrq4Index(Term term, IndexReader reader) + // { + // return getTermFrq4Docs(term, MultiFields.getLiveDocs(reader), reader); + // } + + + + /** + * Gets the document object and the index document index/number + */ + @SuppressWarnings("javadoc") + public static TwoValuesBox getUniqueDocWithTerm(Term idTerm2search, IndexSearcher indexSearcher) + { + return getUniqueDocWithTerm(idTerm2search, indexSearcher, null); + } + + + + /** + * Gets the document object and the index document index/number + */ + @SuppressWarnings("javadoc") + public static TwoValuesBox getUniqueDocWithTerm(Term idTerm2search, IndexSearcher indexSearcher, Set fields2load) + { + + try + { + // XXX hier wollen wir einen einfachen Collecor, wir brauchen keine Scores! 
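+            // XXX: a plain collector would do here - the scores are not needed for this ID lookup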
+ TopDocs topDocs = indexSearcher.search(new TermQuery(idTerm2search), 1); + + + if(topDocs.totalHits == 0) return null; + + if(topDocs.totalHits > 1) throw new IllegalStateException("multiple document entries for ID term search"); + + + int doc = topDocs.scoreDocs[0].doc; + + Document document; + if(fields2load == null) + document = indexSearcher.doc(doc); + else + document = indexSearcher.doc(doc, fields2load); + + if(document == null) return null; + + + return new TwoValuesBox(document, doc); + + } + catch (IOException e) + { + throw new RuntimeException(e); + } + + } +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/PageCountEstimator.java b/src/main/java/de/dfki/km/leech/lucene/basic/PageCountEstimator.java new file mode 100644 index 0000000..7fce051 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/PageCountEstimator.java @@ -0,0 +1,107 @@ +package de.dfki.km.leech.lucene.basic; + + + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.LegacyIntField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Terms; + +import java.io.IOException; + + + +public class PageCountEstimator +{ + + + /** + * Adds a page count attribute to a document in the case no one is there. The method estimates the page cont (i.e. 400 terms => 1 page). + * + * @param iDocNo the docs index number + * @param doc2modify the document + * @param strPageCountAttName the field for the generated page count (that should be created) + * @param isHeuristicPageCountFlagAttName an attribute name that will be generated as hint wether a document page count is calculated or not + * @param strBodyAttName the body attribute name to perform the calculation + * @param reader the lucene index reader + * + * @return true in the case the doc was modified, false otherwise + * + * @throws Exception + */ + static public boolean addHeuristicDocPageCounts(int iDocNo, Document doc2modify, String strPageCountAttName, String isHeuristicPageCountFlagAttName, + String strBodyAttName, IndexReader reader) throws Exception + { + // sofern ein Attribut noch nicht vorhanden ist, wird es hier erzeugt - mit Hilfe einer Heuristik + // es wird auch noch ein zusätzliches Attribut eingetragen, welches anzeigt, daß die PageCount mit Hilfe + // einer Heuristik erzeugt wurde + + // wenn es schon einen Eintrag für die Seitenzahlen gibt, wird das Dokument ignoriert (das war zumindest so, solange schöne Zahln im Index + // standen) + String strPageCountValue = doc2modify.get(strPageCountAttName); + // if(strPageCountValue != null) + if(strPageCountValue != null && doc2modify.get(isHeuristicPageCountFlagAttName) == null) + { + + // wenn da so ein verkrutztes Leech-Ding drin steht, dann machen wir da ne schöne Zahl draus :) + int iIndexOfKrutzel = strPageCountValue.indexOf("^^"); + if(iIndexOfKrutzel == -1) return false; + + String strPageCountValueNice = strPageCountValue.substring(0, iIndexOfKrutzel); + doc2modify.removeFields(strPageCountAttName); + + LegacyIntField field = new LegacyIntField(strPageCountAttName, Integer.parseInt(strPageCountValueNice), Store.YES); + + if(field != null) doc2modify.add(field); + + return true; + } + + // wenn es keinen Eintrag für den Content gibt, wird das Dokument ebenfalls ignoriert + String strBodyValue = doc2modify.get(strBodyAttName); + if(strBodyValue == null) return false; + + // wir haben einen Eintrag für den Body und keinen für die Seitenzahlen 
- also frisch ans Werk ;) + + int iPageCount = 0; + + // die Heuristik: 400 Terme ergeben eine Seite + + int iDocTermCount = getDocumentTermCount(iDocNo, strBodyAttName, reader); + + // ich sag jetzt mal einfach, daß ungefähr 400 Wörter auf einer Seite sind... + iPageCount = (iDocTermCount / 400) + 1; + + // die geschätzte PageCount + doc2modify.removeFields(strPageCountAttName); + LegacyIntField field = new LegacyIntField(strPageCountAttName, iPageCount, Store.YES); + if(field != null) doc2modify.add(field); + // ein Flag, welches anzeigt, daß dieser TermCount geschätzt wurde + doc2modify.removeFields(isHeuristicPageCountFlagAttName); + StringField newField = new StringField(isHeuristicPageCountFlagAttName, "true", Store.YES); + if(newField != null) doc2modify.add(newField); + + + return true; + } + + + + public static Integer getDocumentTermCount(int iDocNo, String strFieldName4TermCounting, IndexReader reader) throws IOException + { + + long lTermCount = 0; + + + Terms termVector = reader.getTermVector(iDocNo, strFieldName4TermCounting); + + // manchmal gibt es auch Dokumente, die keinen content bzw. keinen TermFreqVector haben.... + if(termVector != null) lTermCount = termVector.getSumTotalTermFreq(); + + + return Long.valueOf(lTermCount).intValue(); + } + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequenciesEntry.java b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequenciesEntry.java new file mode 100644 index 0000000..80e06b6 --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequenciesEntry.java @@ -0,0 +1,41 @@ +package de.dfki.km.leech.lucene.basic; + + + + + +public class Term2FrequenciesEntry +{ + + public String term; + + public Integer documentFrequency; + + public Long totalIndexFrequency; + + + + public Term2FrequenciesEntry() + { + } + + + + public Term2FrequenciesEntry(String term, Integer documentFrequency, Long totalIndexFrequency) + { + this.term = term; + this.documentFrequency = documentFrequency; + this.totalIndexFrequency = totalIndexFrequency; + + } + + + + + @Override + public String toString() + { + return "Term:" + term + " docFRQ:" + documentFrequency + " totalFRQ:" + totalIndexFrequency; + } + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequencyEntry.java b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequencyEntry.java new file mode 100644 index 0000000..a2ffa7a --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/Term2FrequencyEntry.java @@ -0,0 +1,65 @@ +package de.dfki.km.leech.lucene.basic; + + + + + +public class Term2FrequencyEntry +{ + + public String term; + + public Integer frequency; + + + + public Term2FrequencyEntry() + { + } + + + + public Term2FrequencyEntry(String strTerm, Integer iFrequency) + { + term = strTerm; + frequency = iFrequency; + + } + + + + public String getTerm() + { + return term; + } + + + + public void setTerm(String term) + { + this.term = term; + } + + + + public Integer getFrequency() + { + return frequency; + } + + + + public void setFrequency(Integer frequency) + { + this.frequency = frequency; + } + + + + @Override + public String toString() + { + return "Term:" + getTerm() + " FRQ:" + getFrequency(); + } + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/TermPosition.java b/src/main/java/de/dfki/km/leech/lucene/basic/TermPosition.java new file mode 100644 index 0000000..eb5336e --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/TermPosition.java @@ -0,0 +1,59 @@ +package de.dfki.km.leech.lucene.basic; + + + + + 
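+/**
+ * Simple value holder for one term occurrence: its token position in the field and the corresponding start/end character offsets.
+ */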
+public class TermPosition +{ + + Integer m_iEndOffset; + + Integer m_iPosition; + + Integer m_iStartOffset; + + + + public Integer getEndOffset() + { + return m_iEndOffset; + } + + + + public Integer getPosition() + { + return m_iPosition; + } + + + + public Integer getStartOffset() + { + return m_iStartOffset; + } + + + + public void setEndOffset(Integer endOffset) + { + m_iEndOffset = endOffset; + } + + + + public void setPosition(Integer position) + { + m_iPosition = position; + } + + + + public void setStartOffset(Integer startOffset) + { + m_iStartOffset = startOffset; + } + + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/TextWithTermVectorOffsetsField.java b/src/main/java/de/dfki/km/leech/lucene/basic/TextWithTermVectorOffsetsField.java new file mode 100644 index 0000000..f4c397c --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/TextWithTermVectorOffsetsField.java @@ -0,0 +1,48 @@ +package de.dfki.km.leech.lucene.basic; + + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; + + + +public class TextWithTermVectorOffsetsField extends Field +{ + + + + /** + * Creates a new {@link TextWithTermVectorOffsetsField}. Default is to generate a stored field. + * + * @param name field name + * @param value String value + * @throws IllegalArgumentException if the field name or value is null. + */ + public TextWithTermVectorOffsetsField(String name, String value) + { + + super(name, value, new DynamicFieldType(TextField.TYPE_STORED).setStoreTermVectorS(true).setStoreTermVectorOffsetS(true).freezE()); + + } + + + + /** + * Creates a new {@link TextWithTermVectorOffsetsField} + * + * @param name field name + * @param value String value + * @param stored Store.YES if the content should also be stored + * @throws IllegalArgumentException if the field name or value is null. + */ + public TextWithTermVectorOffsetsField(String name, String value, Store stored) + { + + + super(name, value, new DynamicFieldType(stored == Store.YES ? 
TextField.TYPE_STORED : TextField.TYPE_NOT_STORED).setStoreTermVectorS(true) + .setStoreTermVectorOffsetS(true).freezE()); + + } + + +} diff --git a/src/main/java/de/dfki/km/leech/lucene/basic/URINotFoundException.java b/src/main/java/de/dfki/km/leech/lucene/basic/URINotFoundException.java new file mode 100644 index 0000000..a937b3d --- /dev/null +++ b/src/main/java/de/dfki/km/leech/lucene/basic/URINotFoundException.java @@ -0,0 +1,38 @@ +package de.dfki.km.leech.lucene.basic; + + + +public class URINotFoundException extends Exception +{ + private static final long serialVersionUID = 8317129753714055831L; + + + + public URINotFoundException() + { + super(); + } + + + + public URINotFoundException(String message, Throwable cause) + { + super(message, cause); + } + + + + public URINotFoundException(String message) + { + super(message); + } + + + + public URINotFoundException(Throwable cause) + { + super(cause); + } + + +} diff --git a/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java index 907633e..fad0e0e 100644 --- a/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java +++ b/src/main/java/de/dfki/km/leech/parser/CrawlerParser.java @@ -31,7 +31,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import de.dfki.inquisition.collections.MultiValueHashMap; +import de.dfki.inquisitor.collections.MultiValueHashMap; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser; import de.dfki.km.leech.sax.DataSinkContentHandler; diff --git a/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java index 4721d53..d5fc5c9 100644 --- a/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java +++ b/src/main/java/de/dfki/km/leech/parser/DirectoryCrawlerParser.java @@ -44,7 +44,7 @@ import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; -import de.dfki.inquisition.collections.MultiValueHashMap; +import de.dfki.inquisitor.collections.MultiValueHashMap; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.config.DirectoryCrawlerContext; diff --git a/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java index 5729f63..65b4da8 100644 --- a/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java +++ b/src/main/java/de/dfki/km/leech/parser/HtmlCrawlerParser.java @@ -44,9 +44,9 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import de.dfki.inquisition.collections.MultiValueHashMap; -import de.dfki.inquisition.processes.StopWatch; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.collections.MultiValueHashMap; +import de.dfki.inquisitor.processes.StopWatch; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.config.HtmlCrawlerContext; diff --git a/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java index f54c5ac..e5fedd4 100644 --- a/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java +++ b/src/main/java/de/dfki/km/leech/parser/ImapCrawlerParser.java @@ -55,8 +55,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import de.dfki.inquisition.collections.MultiValueHashMap; -import 
de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.collections.MultiValueHashMap; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.config.ImapCrawlerContext; diff --git a/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java index 816b8a8..747ee66 100644 --- a/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java +++ b/src/main/java/de/dfki/km/leech/parser/NonRecursiveCrawlerParser.java @@ -9,7 +9,7 @@ import org.apache.tika.parser.ParseContext; import org.xml.sax.ContentHandler; -import de.dfki.inquisition.collections.MultiValueHashMap; +import de.dfki.inquisitor.collections.MultiValueHashMap; import de.dfki.km.leech.SubDataEntityContentHandler; import de.dfki.km.leech.metadata.LeechMetadata; diff --git a/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java b/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java index 8e60d1c..f5d9d8c 100644 --- a/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java +++ b/src/main/java/de/dfki/km/leech/parser/UrlListCrawlerParser.java @@ -18,7 +18,7 @@ import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; -import de.dfki.inquisition.collections.MultiValueHashMap; +import de.dfki.inquisitor.collections.MultiValueHashMap; import de.dfki.km.leech.Leech; import de.dfki.km.leech.io.URLStreamProvider; diff --git a/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java b/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java index f3010da..7199b28 100644 --- a/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java +++ b/src/main/java/de/dfki/km/leech/parser/incremental/IncrementalCrawlingHistory.java @@ -45,7 +45,7 @@ import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Bits; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.config.CrawlerContext; diff --git a/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java b/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java index 87cbf39..adbf456 100644 --- a/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java +++ b/src/main/java/de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.java @@ -54,9 +54,9 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import de.dfki.inquisition.collections.MultiValueBalancedTreeMap; -import de.dfki.inquisition.collections.MultiValueHashMap; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.collections.MultiValueBalancedTreeMap; +import de.dfki.inquisitor.collections.MultiValueHashMap; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.metadata.LeechMetadata; import de.dfki.km.leech.util.TikaUtils; @@ -658,7 +658,7 @@ protected void parseGeoCoordinates(String strText, Metadata metadata) - protected void parseInfoBox(String strText, Metadata metadata, ContentHandler handler) throws SAXException + protected void parseInfoBox(String strText, Metadata metadata, ContentHandler handler) throws SAXException, IOException { // att-value paare mit | getrennt. Innerhalb eines values gibt es auch Zeilenumbrüche (mit '
    ') - dies gilt als Aufzählung @@ -673,7 +673,7 @@ protected void parseInfoBox(String strText, Metadata metadata, ContentHandler ha // als erstes schneiden wir mal die Infobox raus. (?m) ist multiline und (?s) ist dotall ('.' matcht auch line breaks) int iStartInfoBox = -1; int iEndInfoBox = -1; - MatchResult infoMatch = StringUtils.findFirst("\\{\\{\\s*Infobox", strText); + MatchResult infoMatch = StringUtils.findFirstMatch("\\{\\{\\s*Infobox", strText); if(infoMatch != null) { iStartInfoBox = infoMatch.start(); diff --git a/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java b/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java index 8d36c17..b64ffd5 100644 --- a/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java +++ b/src/main/java/de/dfki/km/leech/sax/CrawlReportContentHandler.java @@ -28,10 +28,10 @@ import org.apache.tika.metadata.Metadata; -import de.dfki.inquisition.collections.CollectionUtilz; -import de.dfki.inquisition.collections.MultiValueTreeMap; -import de.dfki.inquisition.processes.StopWatch; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.collections.CollectionUtilz; +import de.dfki.inquisitor.collections.MultiValueTreeMap; +import de.dfki.inquisitor.processes.StopWatch; +import de.dfki.inquisitor.text.StringUtils; diff --git a/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java b/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java index 77b8070..c5da910 100644 --- a/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java +++ b/src/main/java/de/dfki/km/leech/solr/ToSolrContentHandler.java @@ -14,8 +14,8 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.tika.metadata.Metadata; -import de.dfki.inquisition.collections.MultiValueHashMap; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.collections.MultiValueHashMap; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.metadata.LeechMetadata; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; import de.dfki.km.leech.sax.DataSinkContentHandler; diff --git a/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java b/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java index fdd4d0d..3eb31f6 100644 --- a/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java +++ b/src/main/java/de/dfki/km/leech/util/IndexPostprocessor.java @@ -14,6 +14,7 @@ import java.util.Set; import java.util.logging.Logger; +import de.dfki.km.leech.lucene.basic.*; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.*; @@ -26,14 +27,14 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.tika.metadata.Metadata; -import de.dfki.inquisition.file.FileUtils; -import de.dfki.inquisition.lucene.Buzzwords; -import de.dfki.inquisition.lucene.DocumentFrqClass; -import de.dfki.inquisition.lucene.FieldConfig; -import de.dfki.inquisition.lucene.LuceneUtilz; -import de.dfki.inquisition.lucene.PageCountEstimator; -import de.dfki.inquisition.processes.StopWatch; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.file.FileUtilz; +// import de.dfki.inquisitor.lucene.Buzzwords; +// import de.dfki.inquisitor.lucene.DocumentFrqClass; +// import de.dfki.inquisitor.lucene.FieldConfig; +// import de.dfki.inquisitor.lucene.LuceneUtilz; +// import de.dfki.inquisitor.lucene.PageCountEstimator; +import de.dfki.inquisitor.processes.StopWatch; +import de.dfki.inquisitor.text.StringUtils; 
import de.dfki.km.leech.lucene.ToLuceneContentHandler; import de.dfki.km.leech.metadata.LeechMetadata; @@ -90,11 +91,6 @@ static protected List terms(String strFieldName, String strPrefix, int i /** * Enables the Buzzword creation by setting the related configuration parameters. - * - * @param strNewField4Buzzwords - * @param sAttNames4BuzzwordCalculation - * @param iMaxNumberOfBuzzwords - * @param bSkipSimilarTerms */ public void enableBuzzwordGeneration(String strNewField4Buzzwords, int iMaxNumberOfBuzzwords, boolean bSkipSimilarTerms) { @@ -284,8 +280,8 @@ public void postprocessIndex(String strLuceneIndexPath, FieldConfig fieldConfig, } // fOurTmpDir.renameTo(fLuceneIndex); - FileUtils.deleteDirectory(new File(pUnpostProcessed.toString())); - FileUtils.deleteDirectory(fOurTmpDir.toFile()); + FileUtilz.deleteDirectory(new File(pUnpostProcessed.toString())); + FileUtilz.deleteDirectory(fOurTmpDir.toFile()); diff --git a/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java b/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java index 9af0f33..1025ece 100644 --- a/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java +++ b/src/main/java/de/dfki/km/leech/util/LuceneIndexCreator.java @@ -13,6 +13,7 @@ import java.util.logging.Level; import java.util.logging.Logger; +import de.dfki.km.leech.lucene.basic.FieldConfig; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; @@ -23,10 +24,10 @@ import org.apache.tika.parser.ParseContext; import org.xml.sax.SAXException; -import de.dfki.inquisition.collections.MultiValueHashMap; -import de.dfki.inquisition.lucene.FieldConfig; -import de.dfki.inquisition.processes.StopWatch; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.collections.MultiValueHashMap; +// import de.dfki.inquisitor.lucene.FieldConfig; +import de.dfki.inquisitor.processes.StopWatch; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.lucene.LeechDefaultFieldConfig; @@ -40,8 +41,7 @@ /** - * A very simple Lucene Index creator. FieldConfig is from {@link WikipediaDumpParser#getFieldConfig4ParserAttributes()}, currently you can only specify the source - * dir/file and the target dir for the lucene index + * A very simple Lucene Index creator. 
Currently you can only specify the source dir/file and the target dir for the lucene index * * @author Christian Reuschling, Dipl.Ing.(BA) * @@ -97,7 +97,7 @@ public static void createIndex(List lUrls2Crawl, String strLuceneIndexPa Leech leech = new Leech(); - long startTime = StopWatch.startAndLogTime(Level.INFO); + long startTime = StopWatch.startAndLogTime(LuceneIndexCreator.class); CrawlReportContentHandler reportContentHandler; @@ -145,7 +145,7 @@ public static void createIndex(List lUrls2Crawl, String strLuceneIndexPa indexWriter.forceMerge(1, true); indexWriter.close(); - StopWatch.stopAndLogDistance(startTime, Level.INFO); + StopWatch.stopAndLogDistance(startTime, LuceneIndexCreator.class); Logger.getLogger(LuceneIndexCreator.class.getName()).info("..finished crawling " + lUrls2Crawl); } diff --git a/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java b/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java index dc76d4b..8ae1cf2 100644 --- a/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java +++ b/src/main/java/de/dfki/km/leech/util/SolrIndexCreator.java @@ -13,8 +13,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import de.dfki.inquisition.collections.MultiValueHashMap; -import de.dfki.inquisition.processes.StopWatch; +import de.dfki.inquisitor.collections.MultiValueHashMap; +import de.dfki.inquisitor.processes.StopWatch; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.WikipediaDumpParserConfig; @@ -76,7 +76,7 @@ public void createIndex(List lUrls2Crawl, String strSolrUrl, MultiValueH Leech leech = new Leech(); - long startTime = StopWatch.startAndLogTime(Level.INFO); + long startTime = StopWatch.startAndLogTime(SolrIndexCreator.class); CrawlReportContentHandler reportContentHandler; @@ -123,7 +123,7 @@ public void createIndex(List lUrls2Crawl, String strSolrUrl, MultiValueH leech.parse(lUrls2Crawl.toArray(new String[0]), finalContentHandler, context); - StopWatch.stopAndLogDistance(startTime, Level.INFO); + StopWatch.stopAndLogDistance(startTime, SolrIndexCreator.class); } diff --git a/src/main/java/de/dfki/km/leech/util/TikaUtils.java b/src/main/java/de/dfki/km/leech/util/TikaUtils.java index 0ff3145..9e2f340 100644 --- a/src/main/java/de/dfki/km/leech/util/TikaUtils.java +++ b/src/main/java/de/dfki/km/leech/util/TikaUtils.java @@ -29,7 +29,7 @@ import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; -import de.dfki.inquisition.text.StringUtils; +import de.dfki.inquisitor.text.StringUtils; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.DirectoryCrawlerParser;
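
Usage sketch (not part of the patch) for the reflective createAnalyzer(analyzerClassName, stopWordFileName) factory shown above. The enclosing class is not visible in this hunk, so LuceneAnalyzerFactory is a placeholder name, and com.example.MyAnalyzer stands in for an analyzer class that actually offers a String[] stop word constructor:

import org.apache.lucene.analysis.Analyzer;

public class CreateAnalyzerExample
{
    public static void main(String[] args) throws Exception
    {
        // no stop word file: a (Version) constructor is tried first, then the no-arg constructor
        Analyzer standard = LuceneAnalyzerFactory.createAnalyzer(
                "org.apache.lucene.analysis.standard.StandardAnalyzer", null);

        // a stop word file only takes effect if the analyzer class has a String[] constructor;
        // otherwise it is ignored with a warning (com.example.MyAnalyzer is hypothetical)
        Analyzer custom = LuceneAnalyzerFactory.createAnalyzer(
                "com.example.MyAnalyzer", "./stopwords_de.txt");

        System.out.println(standard.getClass().getName());
        System.out.println(custom.getClass().getName());
    }
}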
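
A minimal sketch of LuceneUtilz.createQuery and the analyzeText helper behind it: the text is run through the given analyzer, a single surviving token yields a TermQuery, several tokens yield a PhraseQuery, and null is returned if nothing remains. The field name and the StandardAnalyzer are assumptions; in practice the analyzer should match the one used at indexing time:

import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Query;

import de.dfki.km.leech.lucene.basic.LuceneUtilz;

public class CreateQueryExample
{
    public static void main(String[] args)
    {
        Analyzer analyzer = new StandardAnalyzer();

        // tokenized/normalized form of the input text
        List<String> tokens = LuceneUtilz.analyzeText("body", "Lucene Index Creation", analyzer, Integer.MAX_VALUE);
        System.out.println(tokens); // [lucene, index, creation]

        Query phrase = LuceneUtilz.createQuery("body", "Lucene Index Creation", analyzer); // PhraseQuery
        Query single = LuceneUtilz.createQuery("body", "Lucene", analyzer);                // TermQuery

        System.out.println(phrase);
        System.out.println(single);
    }
}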
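
extractQueryTerms temporarily switches MultiTermQuery rewrites to CONSTANT_SCORE_BOOLEAN_REWRITE so that prefix and wildcard queries are expanded against the index and the concrete matching terms can be collected. A sketch, assuming an existing index under ./luceneIndex with a body field:

import java.nio.file.Paths;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.store.FSDirectory;

import de.dfki.km.leech.lucene.basic.LuceneUtilz;

public class ExtractQueryTermsExample
{
    public static void main(String[] args) throws Exception
    {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("./luceneIndex"))))
        {
            QueryParser parser = new QueryParser("body", new StandardAnalyzer());

            // "lucen*" is expanded to the concrete index terms it matches
            Set<Term> terms = LuceneUtilz.extractQueryTerms("lucen*", parser, reader);

            for (Term term : terms)
                System.out.println(term.field() + ":" + term.text());
        }
    }
}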
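
getUniqueDocWithTerm looks up the single document carrying a given ID term and returns it together with its index document number; it returns null on no hit and throws if the term is not unique. The field name "id", the value, the index path, and the generic parameters of TwoValuesBox are assumptions for this sketch:

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

import de.dfki.inquisitor.collections.TwoValuesBox;
import de.dfki.km.leech.lucene.basic.LuceneUtilz;

public class UniqueDocLookupExample
{
    public static void main(String[] args) throws Exception
    {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("./luceneIndex"))))
        {
            IndexSearcher searcher = new IndexSearcher(reader);

            // one document expected per ID term
            TwoValuesBox<Document, Integer> hit =
                    LuceneUtilz.getUniqueDocWithTerm(new Term("id", "file:/tmp/example.txt"), searcher);

            if (hit != null)
                System.out.println("doc no. " + hit.getSecond() + ": " + hit.getFirst());
        }
    }
}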
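
TextWithTermVectorOffsetsField wraps a tokenized text field whose term vectors and offsets are recorded, which is what term-vector-based processing such as the page count estimate needs. A short sketch of adding it to a document; the field names are illustrative:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;

import de.dfki.km.leech.lucene.basic.TextWithTermVectorOffsetsField;

public class TermVectorFieldExample
{
    public static void main(String[] args)
    {
        Document doc = new Document();

        // stored by default, analyzed, with term vectors and offsets
        doc.add(new TextWithTermVectorOffsetsField("body", "some extracted fulltext"));

        // indexed with term vectors but not stored
        doc.add(new TextWithTermVectorOffsetsField("bodyPrivate", "searchable but not stored", Store.NO));

        System.out.println(doc);
    }
}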
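
A sketch of driving PageCountEstimator over an existing index: documents without a page count get one estimated from their term count (roughly 400 terms per page), and a flag field marks the value as heuristic. The field names, the index path, and the omitted write-back step are assumptions; the body field must have been indexed with term vectors for the term count to be available:

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

import de.dfki.km.leech.lucene.basic.PageCountEstimator;

public class PageCountExample
{
    public static void main(String[] args) throws Exception
    {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("./luceneIndex"))))
        {
            for (int docNo = 0; docNo < reader.maxDoc(); docNo++)
            {
                Document doc = reader.document(docNo);

                // adds "pageCount" (estimated from the term count) and "pageCountIsHeuristic" if needed
                boolean modified = PageCountEstimator.addHeuristicDocPageCounts(docNo, doc, "pageCount",
                        "pageCountIsHeuristic", "body", reader);

                if (modified)
                    System.out.println("estimated " + doc.get("pageCount") + " page(s) for doc " + docNo);

                // to persist the change, the document would have to be re-written with an IndexWriter
            }
        }
    }
}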