diff --git a/src/search.cpp b/src/search.cpp index cac35853..72e4899d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -22,6 +22,7 @@ * */ +#include #include #include #include @@ -77,51 +78,57 @@ InternalDataBase::InternalDataBase(const std::vector& archives, bool ve continue; } - if ( first ) { - m_valuesmap = read_valuesmap(database.get_metadata("valuesmap")); - auto language = database.get_metadata("language"); - if (language.empty() ) { - // Database created before 2017/03 has no language metadata. - // However, term were stemmed anyway and we need to stem our - // search query the same the database was created. - // So we need a language, let's use the one of the zim. - // If zimfile has no language metadata, we can't do lot more here :/ - try { - language = archive.getMetadata("Language"); - } catch(...) {} - } - if (!language.empty()) { - icu::Locale languageLocale(language.c_str()); - /* Configuring language base steemming */ - try { - m_stemmer = Xapian::Stem(languageLocale.getLanguage()); - m_queryParser.set_stemmer(m_stemmer); - m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL); - } catch (...) { - std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl; + try { + if ( first ) { + m_valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + auto language = database.get_metadata("language"); + if (language.empty() ) { + // Database created before 2017/03 has no language metadata. + // However, term were stemmed anyway and we need to stem our + // search query the same the database was created. + // So we need a language, let's use the one of the zim. + // If zimfile has no language metadata, we can't do lot more here :/ + try { + language = archive.getMetadata("Language"); + } catch(...) {} } - } - auto stopwords = database.get_metadata("stopwords"); - if ( !stopwords.empty() ){ - std::string stopWord; - std::istringstream file(stopwords); - Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper(); - while (std::getline(file, stopWord, '\n')) { - stopper->add(stopWord); + if (!language.empty()) { + icu::Locale languageLocale(language.c_str()); + /* Configuring language base steemming */ + try { + m_stemmer = Xapian::Stem(languageLocale.getLanguage()); + m_queryParser.set_stemmer(m_stemmer); + m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL); + } catch (...) { + std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl; + } + } + auto stopwords = database.get_metadata("stopwords"); + if ( !stopwords.empty() ){ + std::string stopWord; + std::istringstream file(stopwords); + Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper(); + while (std::getline(file, stopWord, '\n')) { + stopper->add(stopWord); + } + stopper->release(); + m_queryParser.set_stopper(stopper); + } + } else { + std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + if (m_valuesmap != valuesmap ) { + // [TODO] Ignore the database, raise a error ? } - stopper->release(); - m_queryParser.set_stopper(stopper); - } - } else { - std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); - if (m_valuesmap != valuesmap ) { - // [TODO] Ignore the database, raise a error ? } + m_xapianDatabases.push_back(database); + m_database.add_database(database); + m_archives.push_back(archive); + first = false; + } catch( Xapian::DatabaseError& e ) { + // [TODO] Ignore the database or raise a error ? + // As we already ignore the database if `getDbFromAccessInfo` "detects" a DatabaseError, + // we also ignore here. } - m_xapianDatabases.push_back(database); - m_database.add_database(database); - m_archives.push_back(archive); - first = false; } } @@ -278,6 +285,8 @@ int Search::getEstimatedMatches() const return mset.get_matches_estimated(); } catch(Xapian::QueryParserError& e) { return 0; + } catch(Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); } } @@ -288,6 +297,8 @@ const SearchResultSet Search::getResults(int start, int maxResults) const { return SearchResultSet(mp_internalDb, std::move(mset)); } catch(Xapian::QueryParserError& e) { return SearchResultSet(mp_internalDb); + } catch(Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); } } @@ -325,7 +336,11 @@ int SearchResultSet::size() const if (! mp_mset) { return 0; } - return mp_mset->size(); + try { + return mp_mset->size(); + } catch(Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } } SearchResultSet::iterator SearchResultSet::begin() const @@ -333,7 +348,11 @@ SearchResultSet::iterator SearchResultSet::begin() const if ( ! mp_mset ) { return nullptr; } - return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->begin()); + try { + return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->begin()); + } catch(Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } } SearchResultSet::iterator SearchResultSet::end() const @@ -341,7 +360,11 @@ SearchResultSet::iterator SearchResultSet::end() const if ( ! mp_mset ) { return nullptr; } - return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->end()); + try { + return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->end()); + } catch(Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } } } //namespace zim diff --git a/src/search_internal.h b/src/search_internal.h index 781ab820..ff87674e 100644 --- a/src/search_internal.h +++ b/src/search_internal.h @@ -108,25 +108,37 @@ struct SearchIterator::InternalData { {}; Xapian::Document get_document() { - if ( !document_fetched ) { - _document = iterator().get_document(); - document_fetched = true; + try { + if ( !document_fetched ) { + _document = iterator().get_document(); + document_fetched = true; + } + return _document; + } catch ( Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); } - return _document; } int get_databasenumber() { - Xapian::docid docid = *iterator(); - return (docid - 1) % mp_internalDb->m_archives.size(); + try { + Xapian::docid docid = *iterator(); + return (docid - 1) % mp_internalDb->m_archives.size(); + } catch ( Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } } Entry& get_entry() { - if ( !_entry ) { - int databasenumber = get_databasenumber(); - auto archive = mp_internalDb->m_archives.at(databasenumber); - _entry.reset(new Entry(archive.getEntryByPath(get_document().get_data()))); - } + try { + if ( !_entry ) { + int databasenumber = get_databasenumber(); + auto archive = mp_internalDb->m_archives.at(databasenumber); + _entry.reset(new Entry(archive.getEntryByPath(get_document().get_data()))); + } return *_entry.get(); + } catch ( Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } } bool operator==(const InternalData& other) const { diff --git a/src/search_iterator.cpp b/src/search_iterator.cpp index 1403bcda..252180d9 100644 --- a/src/search_iterator.cpp +++ b/src/search_iterator.cpp @@ -18,6 +18,7 @@ * */ +#include #define ZIM_PRIVATE #include "xapian/myhtmlparse.h" @@ -106,20 +107,24 @@ std::string SearchIterator::getPath() const { return ""; } - std::string path = internal->get_document().get_data(); - bool hasNewNamespaceScheme = internal->mp_internalDb->m_archives.at(getFileIndex()).hasNewNamespaceScheme(); + try { + std::string path = internal->get_document().get_data(); + bool hasNewNamespaceScheme = internal->mp_internalDb->m_archives.at(getFileIndex()).hasNewNamespaceScheme(); - std::string dbDataType = internal->mp_internalDb->m_database.get_metadata("data"); - if (dbDataType.empty()) { - dbDataType = "fullPath"; - } + std::string dbDataType = internal->mp_internalDb->m_database.get_metadata("data"); + if (dbDataType.empty()) { + dbDataType = "fullPath"; + } - // If the archive has new namespace scheme and the type of its indexed data - // is `fullPath` we return only the `path` without namespace - if (hasNewNamespaceScheme && dbDataType == "fullPath") { - path = path.substr(2); + // If the archive has new namespace scheme and the type of its indexed data + // is `fullPath` we return only the `path` without namespace + if (hasNewNamespaceScheme && dbDataType == "fullPath") { + path = path.substr(2); + } + return path; + } catch (Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); } - return path; } std::string SearchIterator::getDbData() const { @@ -149,37 +154,41 @@ std::string SearchIterator::getSnippet() const { return ""; } - // Generate full text snippet - if ( ! internal->mp_internalDb->hasValuesmap() ) - { - /* This is the old legacy version. Guess and try */ - std::string stored_snippet = internal->get_document().get_value(1); - if ( ! stored_snippet.empty() ) - return stored_snippet; - /* Let's continue here, and see if we can genenate one */ - } - else if ( internal->mp_internalDb->hasValue("snippet") ) - { - return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet")); - } - - Entry& entry = internal->get_entry(); - /* No reader, no snippet */ try { - /* Get the content of the item to generate a snippet. - We parse it and use the html dump to avoid remove html tags in the - content and be able to nicely cut the text at random place. */ - zim::MyHtmlParser htmlParser; - std::string content = entry.getItem().getData(); + // Generate full text snippet + if ( ! internal->mp_internalDb->hasValuesmap() ) + { + /* This is the old legacy version. Guess and try */ + std::string stored_snippet = internal->get_document().get_value(1); + if ( ! stored_snippet.empty() ) + return stored_snippet; + /* Let's continue here, and see if we can genenate one */ + } + else if ( internal->mp_internalDb->hasValue("snippet") ) + { + return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet")); + } + + Entry& entry = internal->get_entry(); + /* No reader, no snippet */ try { - htmlParser.parse_html(content, "UTF-8", true); - } catch (...) {} - return internal->mp_mset->snippet(htmlParser.dump, - /*length=*/500, - /*stemmer=*/internal->mp_internalDb->m_stemmer, - /*flags=*/0); - } catch (...) { - return ""; + /* Get the content of the item to generate a snippet. + We parse it and use the html dump to avoid remove html tags in the + content and be able to nicely cut the text at random place. */ + zim::MyHtmlParser htmlParser; + std::string content = entry.getItem().getData(); + try { + htmlParser.parse_html(content, "UTF-8", true); + } catch (...) {} + return internal->mp_mset->snippet(htmlParser.dump, + /*length=*/500, + /*stemmer=*/internal->mp_internalDb->m_stemmer, + /*flags=*/0); + } catch (...) { + return ""; + } + } catch (Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); } } @@ -187,20 +196,24 @@ int SearchIterator::getSize() const { return -1; } -int SearchIterator::getWordCount() const { +int SearchIterator::getWordCount() const { if ( ! internal ) { return -1; } - if ( ! internal->mp_internalDb->hasValuesmap() ) - { - /* This is the old legacy version. Guess and try */ - return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str()); - } - else if ( internal->mp_internalDb->hasValue("wordcount") ) - { - return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("wordcount")).c_str()); + try { + if ( ! internal->mp_internalDb->hasValuesmap() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str()); + } + else if ( internal->mp_internalDb->hasValue("wordcount") ) + { + return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("wordcount")).c_str()); + } + return -1; + } catch (Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); } - return -1; } int SearchIterator::getFileIndex() const { @@ -214,7 +227,11 @@ Uuid SearchIterator::getZimId() const { if (! internal ) { throw std::runtime_error("Cannot get zimId from uninitialized iterator"); } - return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid(); + try { + return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid(); + } catch (Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } } SearchIterator::reference SearchIterator::operator*() const { diff --git a/src/suggestion.cpp b/src/suggestion.cpp index 33305c66..940b6049 100644 --- a/src/suggestion.cpp +++ b/src/suggestion.cpp @@ -17,6 +17,7 @@ * */ +#include #define ZIM_PRIVATE #include @@ -39,7 +40,11 @@ SuggestionDataBase::SuggestionDataBase(const Archive& archive, bool verbose) { // Initialize Xapian DB if it is enabled #if defined(ENABLE_XAPIAN) - initXapianDb(); + try { + initXapianDb(); + } catch ( Xapian::DatabaseError& e) { + throw zim::ZimFileFormatError(e.get_description()); + } #endif // ENABLE_XAPIAN } @@ -190,7 +195,7 @@ void SuggestionSearcher::setVerbose(bool verbose) void SuggestionSearcher::initDatabase() { - mp_internalDb = std::make_shared(m_archive, m_verbose); + mp_internalDb = std::make_shared(m_archive, m_verbose); } SuggestionSearch::SuggestionSearch(std::shared_ptr p_internalDb, const std::string& query) diff --git a/src/suggestion_iterator.cpp b/src/suggestion_iterator.cpp index 4a2be504..49d675aa 100644 --- a/src/suggestion_iterator.cpp +++ b/src/suggestion_iterator.cpp @@ -17,6 +17,7 @@ * */ +#include #define ZIM_PRIVATE #include "zim/suggestion_iterator.h" @@ -141,7 +142,11 @@ SuggestionIterator SuggestionIterator::operator--(int) { Entry SuggestionIterator::getEntry() const { #if defined(LIBZIM_WITH_XAPIAN) if (mp_internal) { - return mp_internal->get_entry(); + try { + return mp_internal->get_entry(); + } catch ( Xapian::DatabaseError& e) { + throw ZimFileFormatError(e.get_description()); + } } #endif // LIBZIM_WITH_XAPIAN @@ -157,7 +162,11 @@ std::string SuggestionIterator::getDbData() const { return ""; } - return mp_internal->get_document().get_data(); + try { + return mp_internal->get_document().get_data(); + } catch ( Xapian::DatabaseError& e) { + throw ZimFileFormatError(e.get_description()); + } } std::string SuggestionIterator::getIndexPath() const @@ -166,20 +175,24 @@ std::string SuggestionIterator::getIndexPath() const return ""; } - std::string path = mp_internal->get_document().get_data(); - bool hasNewNamespaceScheme = mp_internal->mp_internalDb->m_archive.hasNewNamespaceScheme(); - - std::string dbDataType = mp_internal->mp_internalDb->m_database.get_metadata("data"); - if (dbDataType.empty()) { - dbDataType = "fullPath"; - } - - // If the archive has new namespace scheme and the type of its indexed data - // is `fullPath` we return only the `path` without namespace - if (hasNewNamespaceScheme && dbDataType == "fullPath") { - path = path.substr(2); + try { + std::string path = mp_internal->get_document().get_data(); + bool hasNewNamespaceScheme = mp_internal->mp_internalDb->m_archive.hasNewNamespaceScheme(); + + std::string dbDataType = mp_internal->mp_internalDb->m_database.get_metadata("data"); + if (dbDataType.empty()) { + dbDataType = "fullPath"; + } + + // If the archive has new namespace scheme and the type of its indexed data + // is `fullPath` we return only the `path` without namespace + if (hasNewNamespaceScheme && dbDataType == "fullPath") { + path = path.substr(2); + } + return path; + } catch ( Xapian::DatabaseError& e) { + throw ZimFileFormatError(e.get_description()); } - return path; } std::string SuggestionIterator::getIndexTitle() const {