Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
picnicpete committed Oct 29, 2024
1 parent dc4ed22 commit 5d884a4
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 60 deletions.
131 changes: 88 additions & 43 deletions classes/file_search.php
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ public static function get_criteria(files $record): object {
*
* @param \tool_advancedreplace\files $record
* @param string $output path
* @param int $limitfrom limit for sql
* @param int $limitnum limit for sql
* @param int $startid minimum id for sql
* @param int $endid maximum id for sql
* @return void
*/
public static function files(files $record, string $output = '', int $limitfrom = 0, int $limitnum = 0) {
public static function files(files $record, string $output = '', int $startid = 0, int $endid = 0) {
global $DB;
\core_php_time_limit::raise();
raise_memory_limit(MEMORY_HUGE);
Expand All @@ -169,22 +169,26 @@ public static function files(files $record, string $output = '', int $limitfrom
}

[$whereclause, $params] = self::make_where_clause($criteria);
// If we are running a shard, then restrict the range of id.
if ( ! empty($startid) || ! empty($endid)) {
$whereclause .= ' AND f.id between :startid and :endid';
$params['startid'] = $startid;
$params['endid'] = $endid;
}

// If the output file already exists, try to resume.
if (file_exists($output)) {
// This must be a resumed job. We need to append to previous output.
[$lastline, $linecount] = helper::read_last_line($output);
$resumemark = self::make_resume_mark($lastline);
$matchcount = ($linecount > 0) ? $linecount - 1 : 0;
[$resumeid, $matchcount] = self::resume($output);
} else {
$resumemark = '';
$matchcount = 0;
$resumeid = 0 ;
$matchcount = 0 ;
}
if ($resumemark != '') {
// Report the resume id and the running match count (both are 0 when there is no previous output file).
print "Operating with resumeid=$resumeid and matchcount is $matchcount\n";
if ( ! empty($resumeid)) {
$stream = fopen($output, 'a');
$whereclause .=
" AND f.component || '-' || f.filearea || '-' || f.contextid || '-' || f.itemid || '-' || f.id >= :resumemark ";
$params['resumemark'] = $resumemark;
$whereclause .= ' AND f.id >= :resumeid ';
$params['resumeid'] = $resumeid;
} else {
$stream = fopen($output, 'w');
$columnheaders = [
Expand All @@ -194,15 +198,13 @@ public static function files(files $record, string $output = '', int $limitfrom
fputcsv($stream, $columnheaders);
}


$record->set('timestart', time());
$updatetime = time();
$updatepercent = 0;
$filecount = 0;
$total = $DB->get_record_sql("SELECT COUNT('x') total FROM {files} f WHERE " . $whereclause, $params);
$totalfiles = $total->total;
if (!empty($limitnum)) {
$totalfiles = min($totalfiles, $limitnum);
}
$sql = "
SELECT
f.id, f.component, f.filearea, f.contextid, f.itemid, f.filename, f.filepath, f.mimetype,
Expand All @@ -215,10 +217,11 @@ public static function files(files $record, string $output = '', int $limitfrom
WHEN ctx.contextlevel = 70 THEN cm.course
END
WHERE $whereclause
ORDER BY f.component, f.filearea, f.contextid, f.itemid, f.id
ORDER BY f.id
";
$fileset = $DB->get_recordset_sql($sql, $params, $limitfrom, $limitnum);
$fileset = $DB->get_recordset_sql($sql, $params);
foreach ($fileset as $filerecord) {
sleep (5); //-------// remove before flight
$matchcount += self::search_file($filerecord, $criteria, $stream);
$filecount ++;
$time = time();
Expand Down Expand Up @@ -588,44 +591,86 @@ public static function make_where_clause(object $criteria): array {
}

/**
* Create a where clause to facilitate resumption after a crash.
* eg AND f.component ||'-'|| f.filearea ||'-'|| >
* @param string $lastline of the previous output file.
* Determine the id of a line from the output csv file.
*
* @param string $line of an output file.
*/
public static function make_resume_mark(string $lastline) {
public static function get_id_from_csv(string $line): int {
// Interpret the last line as a csv line.
$csv = str_getcsv($lastline);
$csv = str_getcsv($line);
print "Testing $line\n";
print_r($csv);
// Check a few columns to ensure we have a valid line
if (empty($csv[self::CSV_COMPONENT])) {
return '';
print "Bad A\n";
return 0;
}
if (empty($csv[self::CSV_FILEAREA])) {
return '';
print "Bad B\n";
return 0;
}
if (empty($csv[self::CSV_CONTEXTID])) {
return '';
}
if (empty($csv[self::CSV_ITEMID])) {
return '';
print "Bad C\n";
return 0;
}
if (empty($csv[self::CSV_FILEID])) {
return '';
}
if ($csv[self::CSV_COMPONENT] == 'component') {
// It looks like we got the header line.
// That is, there was no data in the csv file.
return '';
print "Bad D\n";
return 0;
}
$resumemark =
$csv[self::CSV_COMPONENT] . '-' .
$csv[self::CSV_FILEAREA] . '-' .
$csv[self::CSV_CONTEXTID] . '-' .
$csv[self::CSV_ITEMID] . '-' .
$csv[self::CSV_FILEID];

return $resumemark;

return $csv[self::CSV_FILEID];
}

/**
 * Look at the previous output file to decide how to resume.
 *
 * The id found on the last usable line of the file is taken as the resume point. Every
 * trailing line carrying that id is removed from the file, because the caller restarts
 * its scan at that id (f.id >= resumeid) and would otherwise record those matches twice.
 *
 * @param string $filename path of the csv output file left behind by a previous run
 * @return array [int $resumeid, int $matchcount] the first id that should be scanned and the
 *               number of lines kept in the file, not counting the first (header) line.
 *               [0, 0] is returned when the file cannot be used to resume.
 */
public static function resume(string $filename): array {
    if (!file_exists($filename)) {
        print "No file\n";
        return [0, 0];
    }

    $lines = file($filename, FILE_IGNORE_NEW_LINES);
    // file() returns false when the file cannot be read; treat that like a file that is too short.
    if ($lines === false || count($lines) < 3) {
        // Too small to resume.
        print "Too samll A\n";
        return [0, 0];
    }

    $resumeid = self::get_id_from_csv(end($lines));
    if (empty($resumeid)) {
        // The last line did not yield an id (it may have been cut short when the previous run stopped).
        // Discard it and try the line before it. The length check keeps us from parsing the header line.
        array_pop($lines);
        $resumeid = (count($lines) < 3) ? 0 : self::get_id_from_csv(end($lines));
        if (empty($resumeid)) {
            // If last two lines are bad, give up.
            print "bad lines\n";
            return [0, 0];
        }
    }

    // Now remove all lines that match this id.
    // At least 3 lines exist before every pop, so the line inspected below is never the header.
    while (true) {
        array_pop($lines);
        $lineid = self::get_id_from_csv(end($lines));
        if ($lineid !== $resumeid) {
            // Leave this line in place.
            break;
        }
        if (count($lines) < 3) {
            // Too small to resume.
            print "Now too small\n";
            return [0, 0];
        }
    }

    // Re-write the file, without the matched id lines.
    file_put_contents($filename, implode("\n", $lines) . "\n");
    $matchcount = count($lines) - 1;
    print "UIsing $resumeid and $matchcount \n";
    return [$resumeid, $matchcount];
}
}


Expand Down
12 changes: 6 additions & 6 deletions classes/search.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,19 +76,19 @@ protected function after_delete($result): void {

/**
* Queues a search task to be run
* @param int $limitfrom where to start the search sql
* @param int $limitnum limit the number of sql results
* @param int $startid minimum id to be included
* @param int $endid maximum id to be included
* @return bool true if the task was queued
*/
public function queue_task(int $limitfrom = 0, int $limitnum = 0): bool {
public function queue_task(int $startid = 0, int $endid = 0): bool {
$adhoctask = new $this->adhoctask;
$customdata = [
'searchid' => $this->get('id'),
];
// If we have either limits we should include both.
if (!empty($limitfrom) || !empty($limitnum)) {
$customdata['limitfrom'] = $limitfrom;
$customdata['limitnum'] = $limitnum;
// Pass both ends of the id range to the task; each bound is stored under its own key.
if (!empty($startid) || !empty($endid)) {
    $customdata['startid'] = $startid;
    $customdata['endid'] = $endid;
}
$adhoctask->set_custom_data($customdata);
return \core\task\manager::queue_adhoc_task($adhoctask);
Expand Down
24 changes: 13 additions & 11 deletions classes/task/files.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public function execute() {
return self::spawn_shards($record);
}

\tool_advancedreplace\file_search::files($record, '', $data->limitfrom ?? 0, $data->limitnum ?? 0);
\tool_advancedreplace\file_search::files($record, '', $data->startid ?? 0, $data->endid ?? 0);
}

/**
Expand All @@ -63,16 +63,17 @@ public static function spawn_shards(\tool_advancedreplace\files $record): void {
$numshards = $record->get('shards');
$timestart = time();

// Get the total number of files.
$criteria = \tool_advancedreplace\file_search::get_criteria($record);
[$whereclause, $params] = \tool_advancedreplace\file_search::make_where_clause($criteria);
$totalfiles = $DB->count_records_select('files', $whereclause, $params);

// The sharding will be controlled by ranges of the id column of the mdl_files table.
// Here we implement a simple division. (In future we could use WHERE clause to choose more even break points.)

$maxid = $DB->get_field_sql('SELECT MAX(id) FROM {files}');
// Create and spawn new tasks.
$search = new \tool_advancedreplace\files($searchid);
$basedata = $search->copy_data(true);
$limitfrom = 0;
$limitnum = ceil($totalfiles / $numshards);
$shardsize = ceil($maxid / $numshards);
$startid = 0;
$endid = $startid + $shardsize -1;
$shardnum = 0;
while ($shardnum < $numshards) {
$shardnum++;
Expand All @@ -82,11 +83,12 @@ public static function spawn_shards(\tool_advancedreplace\files $record): void {

// Remove the limit on the last shard.
if ($shardnum === $numshards) {
$limitnum = 0;
$endid = $maxid;
}

$shard->queue_task($limitfrom, $limitnum);
$limitfrom += $limitnum;
$shard->queue_task($startid, $endid);
$startid += $shardsize;
$endid += $shardsize;
}

// Update the start time on the parent.
Expand Down

0 comments on commit 5d884a4

Please sign in to comment.