Skip to content

Commit

Permalink
feat: Tantivy_search_engine (#222)
Browse files Browse the repository at this point in the history
* create refs database

* splitted data layer

* split data layer

* fix find ref screen

* comment helper

* added mimir integration

* moved ref find to isolate

* added refs indexing screen

* fixed keyboard shortcuts

* added book filtering for full text search and isar Line model

* fixed filter_list package

* filter books for search by category

* legacy FT search option

* ability to stop indexing

* save indexed books

* adjustments to searchEngine package

* using searcStream

* fixy

* hiding create index button while indexing

* fixed

* tested and working

* switched to stream

* added listener

* few improvements

* fixed a bug

* sort entities in book tree and reorganised some code files.

* Merge branch 'main' into tantivy_search_engine
  • Loading branch information
Sivan22 authored Sep 1, 2024
1 parent de53911 commit dfd19c0
Show file tree
Hide file tree
Showing 82 changed files with 4,434 additions and 2,283 deletions.
50 changes: 0 additions & 50 deletions lib/data/data.dart

This file was deleted.

File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@ import 'dart:isolate';
import 'dart:convert';
import 'package:csv/csv.dart';
import 'package:flutter/services.dart';
import 'package:otzaria/data/cache_provider.dart';
import 'package:otzaria/data/data_providers/cache_provider.dart';
import 'package:otzaria/utils/docx_to_otzaria.dart';
import 'package:flutter_settings_screens/flutter_settings_screens.dart';
import 'package:otzaria/utils/text_manipulation.dart';
import 'package:otzaria/models/books.dart';
import 'package:otzaria/data/data.dart';
import 'package:otzaria/models/library.dart';
import 'package:otzaria/models/links.dart';

Expand All @@ -24,9 +23,10 @@ import 'package:otzaria/models/links.dart';
/// The inner representation of the library is a tree of directories and files,
/// in which every book is stored in a file and every directory represents a category.
/// The metadata is stored in a JSON file.
class FileSystemData extends Data {
class FileSystemData {
late String libraryPath;
Map<String, String> titleToPath = {};
Map<String, dynamic> metadata = {};

FileSystemData() {
_initialize();
Expand All @@ -42,8 +42,6 @@ class FileSystemData extends Data {
_updateTitleToPath();
}

@override

/// Returns the library
Future<Library> getLibrary() async {
return _getLibraryFromDirectory(
Expand Down Expand Up @@ -101,6 +99,8 @@ class FileSystemData extends Data {
}
}
}
category.subCategories.sort((a, b) => a.order.compareTo(b.order));
category.books.sort((a, b) => a.order.compareTo(b.order));
return category;
}

Expand All @@ -114,15 +114,14 @@ class FileSystemData extends Data {
Directory(entity.path), library));
}
}
library.subCategories.sort((a, b) => a.order.compareTo(b.order));
return library;
}

@override
Future<List<ExternalBook>> getOtzarBooks() {
return _getOtzarBooks();
}

@override
Future<List<ExternalBook>> getHebrewBooks() {
return _getHebrewBooks();
}
Expand Down Expand Up @@ -214,7 +213,6 @@ class FileSystemData extends Data {

///the implementation of the links from app's model, based on the filesystem.
///the links are in the folder 'links' with the name '<book_title>_links.json'
@override
Future<List<Link>> getAllLinksForBook(String title) async {
try {
File file = File(_getLinksPath(title));
Expand All @@ -227,8 +225,6 @@ class FileSystemData extends Data {
}
}

@override

/// Retrieves the text for a book with the given title asynchronously (using Isolate).
/// supports docx files
Future<String> getBookText(String title) {
Expand All @@ -244,8 +240,6 @@ class FileSystemData extends Data {
});
}

@override

/// A file-system approach to getting the content of a link:
/// the file is read line by line and the content of the line with the given index is returned.
Future<String> getLinkContent(Link link) async {
Expand All @@ -266,7 +260,7 @@ class FileSystemData extends Data {
/// Returns the title of the book with the given path.
// Retrieves the table of contents for a book with the given title.
@override

Future<List<TocEntry>> getBookToc(String title) async {
return _parseToc(getBookText(title));
}
Expand Down
File renamed without changes.
210 changes: 210 additions & 0 deletions lib/data/data_providers/isar_data_provider.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
import 'dart:isolate';

import 'package:flutter/foundation.dart';
import 'package:flutter_settings_screens/flutter_settings_screens.dart';
import 'package:isar/isar.dart';
import 'package:otzaria/models/books.dart';
import 'package:otzaria/models/isar_collections/line.dart';
import 'package:otzaria/models/isar_collections/ref.dart';
import 'package:fuzzywuzzy/fuzzywuzzy.dart';
import 'package:otzaria/models/library.dart';
import 'package:pdfrx/pdfrx.dart';

class IsarDataProvider {
/// The shared instance handed out by [instance].
static final IsarDataProvider _singleton = IsarDataProvider();

/// Returns the shared [IsarDataProvider].
static IsarDataProvider get instance => _singleton;

// NOTE(review): this constructor is public, so callers can bypass [instance]
// and construct a second provider (which would run `Isar.open` again) —
// confirm whether it should be private.
IsarDataProvider();

/// The Isar database holding the [Ref] and [Line] collections.
///
/// Opened in the directory stored under the `key-library-path` setting,
/// falling back to a hard-coded Windows path when the setting is absent.
final isar = Isar.open(
directory: Settings.getValue<String>('key-library-path') ?? 'C:\\אוצריא',
maxSizeMiB: 100000,
schemas: [
RefSchema,
LineSchema,
],
);

/// Progress of [createRefsFromLibrary]: books handled so far.
///
/// Starts as `null` and is set back to `null` when that method completes.
ValueNotifier<int?> refsNumOfbooksDone = ValueNotifier(null);

/// Progress of [createRefsFromLibrary]: size of the batch being handled.
///
/// Starts as `null` and is set back to `null` when that method completes.
ValueNotifier<int?> refsNumOfbooksTotal = ValueNotifier(null);

/// Progress of [addAllLines]: books whose lines were already stored.
ValueNotifier<int?> linesNumOfbooksDone = ValueNotifier(null);

/// Progress of [addAllLines]: total number of books to store.
ValueNotifier<int?> linesNumOfbooksTotal = ValueNotifier(null);

/// Rebuilds the refs collection from the books of [library].
///
/// The existing refs are cleared first. Then, skipping the first
/// [startIndex] books of each kind:
///
/// * every [TextBook] contributes one [Ref] per table-of-contents entry
///   (nested entries are prefixed with their ancestors' text);
/// * every [PdfBook] contributes one [Ref] per outline node.
///
/// A book that fails is logged and skipped so that a single broken file does
/// not abort the whole run. [refsNumOfbooksDone] and [refsNumOfbooksTotal]
/// report progress while running and are always reset to `null` afterwards,
/// even when an unexpected error escapes.
Future<void> createRefsFromLibrary(Library library, int startIndex) async {
  isar.write((isar) => isar.refs.clear());
  try {
    // Materialised once: a lazy `skip` view would be re-walked on every
    // `length` read.
    final textBooks =
        library.getAllBooks().whereType<TextBook>().skip(startIndex).toList();
    refsNumOfbooksTotal.value = textBooks.length;
    for (int i = 0; i < textBooks.length; i++) {
      final TextBook book = textBooks[i];
      try {
        print('Creating refs for ${book.title} ($i/${textBooks.length})');
        refsNumOfbooksDone.value = i;
        final List<TocEntry> toc = await book.tableOfContents;

        // Flatten the TOC tree depth-first.
        final List<TocEntry> allTocs = [];

        void searchToc(List<TocEntry> entries) {
          for (final TocEntry entry in entries) {
            allTocs.add(entry);
            // NOTE(review): this rewrites the text of the book's own TocEntry
            // objects; if `tableOfContents` is cached, a second run would
            // prefix the ancestors again — confirm.
            for (final child in entry.children) {
              child.text = '${entry.text},${child.text}';
            }
            searchToc(entry.children);
          }
        }

        searchToc(toc);
        final List<Ref> refs = [
          for (final TocEntry entry in allTocs)
            Ref(
                id: isar.refs.autoIncrement(),
                // Quote marks are stripped so that searches match either way.
                ref: entry.text
                    .replaceAll('"', '')
                    .replaceAll("'", '')
                    .replaceAll('״', ''),
                bookTitle: book.title,
                index: entry.index,
                pdfBook: false),
        ];
        isar.write((isar) => isar.refs.putAll(refs));
        print('Done creating refs for ${book.title} ');
      } catch (e) {
        print(' Failed creating refs for ${book.title} $e');
      }
    }

    final pdfBooks =
        library.getAllBooks().whereType<PdfBook>().skip(startIndex).toList();
    refsNumOfbooksTotal.value = pdfBooks.length;
    for (int i = 0; i < pdfBooks.length; i++) {
      final PdfBook book = pdfBooks[i];
      refsNumOfbooksDone.value = i;
      try {
        final document = await PdfDocument.openFile(book.path);
        try {
          final List<PdfOutlineNode> outlines = await document.loadOutline();

          // Flatten the outline tree depth-first.
          final List<PdfOutlineNode> allOutlines = [];

          void searchOutline(List<PdfOutlineNode> entries) {
            for (final PdfOutlineNode entry in entries) {
              allOutlines.add(entry);
              searchOutline(entry.children);
            }
          }

          searchOutline(outlines);

          final List<Ref> refs = [];
          for (final PdfOutlineNode entry in allOutlines) {
            final ref = Ref(
              id: isar.refs.autoIncrement(),
              ref: "${book.title} ${entry.title}",
              bookTitle: book.title,
              index: entry.dest?.pageNumber ?? 0,
              pdfBook: true,
              pdfPath: book.path,
            );
            print('Adding Pdf ref: ${ref.ref}');
            refs.add(ref);
          }
          // One transaction per book instead of one per outline node.
          isar.write((isar) => isar.refs.putAll(refs));
        } finally {
          // Release the native document handle once its outline is read.
          await document.dispose();
        }
      } catch (e) {
        print(' Failed creating refs for ${book.title} $e');
      }
    }
  } finally {
    refsNumOfbooksDone.value = null;
    refsNumOfbooksTotal.value = null;
  }
}

/// Returns every stored [Ref] whose book title equals the title of [book].
List<Ref> getRefsForBook(TextBook book) {
  final byTitle = isar.refs.where().bookTitleEqualTo(book.title);
  return byTitle.findAll();
}

/// Returns every [Ref] stored in the database.
List<Ref> getAllRefs() => isar.refs.where().findAll();

/// Finds the stored refs whose text contains every space-separated word of
/// [ref].
///
/// Empty tokens, produced by leading, trailing or repeated spaces, are
/// dropped so that they are never passed on as a `contains` condition.
///
/// NOTE(review): for an all-blank query no word is left, so the result is
/// whatever `allOf` yields for an empty list — confirm that this is the
/// desired behaviour for callers.
Future<List<Ref>> findRefs(String ref) {
  final words = ref.split(' ').where((word) => word.isNotEmpty).toList();
  return isar.refs
      .where()
      .allOf(
        words,
        (q, word) => q.refContains(word),
      )
      .findAllAsync();
}

/// Finds refs matching [ref] and orders them by fuzzy similarity to it.
///
/// The matches returned by [findRefs] are grouped by book title and only the
/// first [limit] matches of each book are kept (in the order the query
/// returned them, not by score). The survivors are then sorted by their
/// `ratio` score against [ref], best match first.
///
/// Grouping, scoring and sorting run in a background isolate.
Future<List<Ref>> findRefsByRelevance(String ref, {int limit = 10}) async {
  final matches = await findRefs(ref);
  return await Isolate.run(() {
    final shortlisted = <Ref>[];
    for (final sameBook in matches.groupBy((r) => r.bookTitle).values) {
      shortlisted.addAll(sameBook.take(limit));
    }

    // Score each candidate exactly once; computing the ratio inside the
    // comparator would repeat it on every comparison.
    final scored = [
      for (final candidate in shortlisted)
        MapEntry(candidate, ratio(ref, candidate.ref)),
    ];
    scored.sort((a, b) => b.value.compareTo(a.value));
    return [for (final entry in scored) entry.key];
  });
}

/// Returns how many distinct book titles appear among the stored refs.
Future<int> getNumberOfBooksWithRefs() async {
  final storedRefs = isar.refs.where().findAll();
  final distinctTitles = storedRefs.map((r) => r.bookTitle).toSet();
  return distinctTitles.length;
}

/// Stores the lines of every [TextBook] in [library], one book at a time.
///
/// [linesNumOfbooksTotal] is set to the number of books up front and
/// [linesNumOfbooksDone] is advanced after each book has been stored.
Future<void> addAllLines(Library library) async {
  final books = library.getAllBooks().whereType<TextBook>().toList();
  linesNumOfbooksTotal.value = books.length;
  linesNumOfbooksDone.value = 0;

  // Indexed loop: looking the position up with `indexOf` costs a scan per
  // book and reports the wrong count if two books compare equal.
  for (var done = 0; done < books.length; done++) {
    final book = books[done];
    print('Adding lines for ${book.title}');
    await addLinesForBook(book);
    linesNumOfbooksDone.value = done + 1;
  }
}

/// Splits the text of [book] on newlines and stores each piece as a [Line].
///
/// Every line records the book title, the book topics and its zero-based
/// position in the text; all lines are written in a single transaction.
Future<void> addLinesForBook(TextBook book) async {
  final content = await book.text;
  final rows = content.split('\n');

  final lines = <Line>[
    for (var position = 0; position < rows.length; position++)
      Line(
        id: isar.lines.autoIncrement(),
        text: rows[position],
        bookTitle: book.title,
        topics: book.topics,
        index: position,
      ),
  ];

  isar.write((isar) => isar.lines.putAll(lines));
}

/// Returns every stored [Line] whose book title equals the title of [book].
Future<List<Line>> getLinesForBook(TextBook book) async {
  final byTitle = isar.lines.where().bookTitleEqualTo(book.title);
  return byTitle.findAll();
}

/// Returns every [Line] stored in the database.
Future<List<Line>> getAllLines() async {
  final everything = isar.lines.where();
  return everything.findAll();
}

/// Returns the stored lines whose text contains [text].
Future<List<Line>> findLines(String text) async {
  final containing = isar.lines.where().textContains(text);
  return containing.findAllAsync();
}
}

/// Grouping helpers for any [Iterable].
extension Iterables<E> on Iterable<E> {
  /// Buckets the elements by the key that [keyFunction] returns for each.
  ///
  /// Keys appear in the order they are first encountered, and the elements
  /// within a bucket keep their original relative order.
  Map<K, List<E>> groupBy<K>(K Function(E) keyFunction) {
    final buckets = <K, List<E>>{};
    for (final element in this) {
      final key = keyFunction(element);
      (buckets[key] ??= <E>[]).add(element);
    }
    return buckets;
  }
}
Loading

0 comments on commit dfd19c0

Please sign in to comment.