-
Notifications
You must be signed in to change notification settings - Fork 94
/
FileDataReader.php
117 lines (96 loc) · 3.16 KB
/
FileDataReader.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
<?php
namespace LLPhant\Embeddings\DataReader;
use LLPhant\Embeddings\Document;
use Smalot\PdfParser\Parser;
/**
 * Builds documents from a single file or from every file found under a directory tree.
 *
 * PDF files are read through Smalot\PdfParser\Parser, ".docx" files through the
 * DocxReader class of this namespace, and every other file is read verbatim with
 * file_get_contents(). An optional extension allow-list restricts which files are read.
 */
final class FileDataReader implements DataReader
{
    /** Value copied into the sourceType property of every document produced by this reader. */
    public string $sourceType = 'files';

    /**
     * @template T of Document
     *
     * @param  string  $filePath  Path of the file or directory to read.
     * @param  class-string<T>  $documentClassName  Class instantiated (without arguments) for each document.
     * @param  string[]  $extensions  Allowed file extensions, without the leading dot and compared
     *                                case-insensitively; an empty array accepts every file.
     */
    public function __construct(
        public string $filePath,
        public readonly string $documentClassName = Document::class,
        private readonly array $extensions = [],
    ) {
    }

    /**
     * Reads the configured path and returns the resulting documents.
     *
     * Returns an empty array when the path does not exist, or when it points to a single
     * file that is filtered out by the extension list or cannot be read.
     *
     * @return Document[]
     */
    public function getDocuments(): array
    {
        if (! file_exists($this->filePath)) {
            return [];
        }

        if (is_dir($this->filePath)) {
            return $this->getDocumentsFromDirectory($this->filePath);
        }

        $content = $this->getContentFromFile($this->filePath);
        if ($content === false) {
            return [];
        }

        // In single-file mode the source name is the path exactly as it was configured.
        return [$this->getDocument($content, $this->filePath)];
    }

    /**
     * Recursively collects one document per readable, allowed file below $directory.
     *
     * @return Document[]
     */
    private function getDocumentsFromDirectory(string $directory): array
    {
        $documents = [];

        $handle = opendir($directory);
        if ($handle === false) {
            // The directory could not be opened: contribute nothing, as before.
            return $documents;
        }

        try {
            while (($entry = readdir($handle)) !== false) {
                // Skip the self and parent pseudo-entries to avoid infinite recursion.
                if ($entry === '.' || $entry === '..') {
                    continue;
                }

                $fullPath = $directory.'/'.$entry;

                if (is_dir($fullPath)) {
                    $documents = [...$documents, ...$this->getDocumentsFromDirectory($fullPath)];

                    continue;
                }

                $content = $this->getContentFromFile($fullPath);
                if ($content !== false) {
                    // In directory mode the source name is the bare entry name, not the full path.
                    $documents[] = $this->getDocument($content, $entry);
                }
            }
        } finally {
            // Release the handle even when one of the file readers throws.
            closedir($handle);
        }

        return $documents;
    }

    /**
     * Extracts the text of one file, choosing the reader from its lower-cased extension.
     *
     * @return string|false The text, or false when the extension is not allowed or
     *                      file_get_contents() fails.
     */
    private function getContentFromFile(string $path): string|false
    {
        $fileExtension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

        if (! $this->validExtension($fileExtension)) {
            return false;
        }

        if ($fileExtension === 'pdf') {
            $parser = new Parser();

            return $parser->parseFile($path)->getText();
        }

        if ($fileExtension === 'docx') {
            $docxReader = new DocxReader();

            return $docxReader->getText($path);
        }

        return file_get_contents($path);
    }

    /**
     * Instantiates the configured document class and fills in its content, source type,
     * source name and the SHA-256 hash of the content.
     *
     * @return Document
     */
    private function getDocument(string $content, string $entry): mixed
    {
        $document = new $this->documentClassName();
        $document->content = $content;
        $document->sourceType = $this->sourceType;
        $document->sourceName = $entry;
        $document->hash = \hash('sha256', $content);

        return $document;
    }

    /**
     * Tells whether a (lower-cased) file extension passes the configured allow-list.
     *
     * The allow-list entries are lower-cased before comparison so that a filter such as
     * ['PDF'] matches, since the caller always passes a lower-cased extension.
     */
    private function validExtension(string $fileExtension): bool
    {
        if ($this->extensions === []) {
            return true;
        }

        $allowed = array_map(
            static fn ($extension): string => strtolower((string) $extension),
            $this->extensions,
        );

        return in_array($fileExtension, $allowed, true);
    }
}