Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use file handles when searching large files #59 #87

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 45 additions & 6 deletions classes/file_search.php
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ class file_search {
*/
const CSV_MATCH = 14;

/** @var int Size in bytes (10MB) at or above which files are read in chunks of this size. Large, to make border cases rare. */
const CHUNK_SIZE = 10 * 1024 * 1024;

/**
* Transforms a file record into criteria for a where clause
* @param \tool_advancedreplace\files $record
Expand Down Expand Up @@ -344,6 +347,45 @@ public static function grep_content($csv, $filecontents, $criteria, $stream): in
return $matchcount;
}

/**
 * Greps a file for the search criteria, reading large files in chunks to limit memory use.
 *
 * Files smaller than CHUNK_SIZE are loaded whole and passed to grep_content() in one call.
 * Larger files are read through a stream handle, up to CHUNK_SIZE bytes at a time, and each
 * chunk is passed to grep_content() with the tail of the previous chunk prepended.
 *
 * @param array $csv Some columns to be output in the csv file.
 * @param \stored_file|\ZipArchive $file The file or zip archive.
 * @param object $criteria The regular expression to be matched.
 * @param resource $stream The stream handle for the output file.
 * @param array $zipstat Stat information about the file inside the zip archive ('size' and 'index' are read).
 *                       Only used when $file is a \ZipArchive.
 * @return int The number of matches found.
 */
public static function grep_processor($csv, $file, $criteria, $stream, $zipstat = []): int {
    $storedfile = $file instanceof \stored_file;

    // For files smaller than chunk size, just open them directly.
    $filesize = $storedfile ? $file->get_filesize() : $zipstat['size'];
    if ($filesize < self::CHUNK_SIZE) {
        $content = $storedfile ? $file->get_content() : $file->getFromIndex($zipstat['index']);
        return self::grep_content($csv, $content, $criteria, $stream);
    }

    // For large files, use a file handle stream instead.
    $matchcount = 0;
    $handle = $storedfile ? $file->get_content_file_handle() : $file->getStream($file->getNameIndex($zipstat['index']));
    if (empty($handle)) {
        return $matchcount;
    }

    // Number of trailing bytes carried over into the next chunk so that a match
    // straddling a chunk border is still seen in one piece.
    $overlap = 100;
    $prev = '';

    try {
        while (!feof($handle)) {
            // Use stream_get_contents() rather than fread(): fread() may return far fewer bytes
            // than requested on streams that are not plain files (e.g. zip entry streams).
            $data = stream_get_contents($handle, self::CHUNK_SIZE);
            if ($data === false || $data === '') {
                // Read failure or nothing left: stop, rather than looping forever or
                // re-searching the carried-over tail on its own.
                break;
            }

            // Prepend the last few characters to the next iteration so no border cases are missed.
            // NOTE(review): a match lying wholly inside the carried-over tail may be reported by
            // both chunks; confirm against grep_content() whether de-duplication is needed.
            $chunk = $prev . $data;
            $matchcount += self::grep_content($csv, $chunk, $criteria, $stream);
            $prev = substr($chunk, -$overlap);
        }
    } finally {
        // Always release the handle, even if grep_content() throws.
        fclose($handle);
    }

    return $matchcount;
}

/**
* Search for the pattern in (the subfiles of) a zip file.
*
Expand Down Expand Up @@ -389,9 +431,8 @@ public static function unzip_content(array $csv, \stored_file $file, object $cri
continue;
}
}

$csv[self::CSV_INTERNAL] = $stat['name'];
$matchcount += self::grep_content($csv, $zip->getFromIndex($i), $criteria, $stream);
$matchcount += self::grep_processor($csv, $zip, $criteria, $stream, $stat);
}
$zip->close();
}
Expand Down Expand Up @@ -447,15 +488,13 @@ public static function search_file(object $filerecord, object $criteria, $stream
switch ($filerecord->mimetype) {
case 'application/zip.h5p':
case 'application/zip':
if (empty($criteria->openzips)) {
$matchcount = 0;
} else {
if (!empty($criteria->openzips)) {
$csv[self::CSV_STRATEGY] = 'zip';
$matchcount = self::unzip_content($csv, $file, $criteria, $stream);
}
break;
default:
$matchcount = self::grep_content($csv, $file->get_content(), $criteria, $stream);
$matchcount = self::grep_processor($csv, $file, $criteria, $stream);
break;
}
return $matchcount;
Expand Down
Loading