diff --git a/classes/file_search.php b/classes/file_search.php index ba48105..eab5807 100644 --- a/classes/file_search.php +++ b/classes/file_search.php @@ -148,11 +148,11 @@ public static function get_criteria(files $record): object { * * @param \tool_advancedreplace\files $record * @param string $output path - * @param int $limitfrom limit for sql - * @param int $limitnum limit for sql + * @param int $startid minimum file id (inclusive) for sql; no range is applied when both ids are 0 + * @param int $endid maximum file id (inclusive) for sql * @return void */ - public static function files(files $record, string $output = '', int $limitfrom = 0, int $limitnum = 0) { + public static function files(files $record, string $output = '', int $startid = 0, int $endid = 0) { global $DB; \core_php_time_limit::raise(); raise_memory_limit(MEMORY_HUGE); @@ -169,22 +169,26 @@ public static function files(files $record, string $output = '', int $limitfrom } [$whereclause, $params] = self::make_where_clause($criteria); + // If we are running a shard, then restrict the range of id. + if ( ! empty($startid) || ! empty($endid)) { + $whereclause .= ' AND f.id between :startid and :endid'; + $params['startid'] = $startid; + $params['endid'] = $endid; + } // If the output file already exists, try to resume. if (file_exists($output)) { // This must be a resumed job. We need to append to previous output. - [$lastline, $linecount] = helper::read_last_line($output); - $resumemark = self::make_resume_mark($lastline); - $matchcount = ($linecount > 0) ? $linecount - 1 : 0; + [$resumeid, $matchcount] = self::resume($output); } else { - $resumemark = ''; - $matchcount = 0; + $resumeid = 0; + $matchcount = 0; } - if ($resumemark != '') { + print "Operating with resumeid=$resumeid and matchcount is $matchcount\n"; + if ( ! 
empty($resumeid)) { $stream = fopen($output, 'a'); - $whereclause .= - " AND f.component || '-' || f.filearea || '-' || f.contextid || '-' || f.itemid || '-' || f.id >= :resumemark "; - $params['resumemark'] = $resumemark; + $whereclause .= ' AND f.id >= :resumeid '; + $params['resumeid'] = $resumeid; } else { $stream = fopen($output, 'w'); $columnheaders = [ @@ -194,15 +198,13 @@ public static function files(files $record, string $output = '', int $limitfrom fputcsv($stream, $columnheaders); } + $record->set('timestart', time()); $updatetime = time(); $updatepercent = 0; $filecount = 0; $total = $DB->get_record_sql("SELECT COUNT('x') total FROM {files} f WHERE " . $whereclause, $params); $totalfiles = $total->total; - if (!empty($limitnum)) { - $totalfiles = min($totalfiles, $limitnum); - } $sql = " SELECT f.id, f.component, f.filearea, f.contextid, f.itemid, f.filename, f.filepath, f.mimetype, @@ -215,10 +217,10 @@ public static function files(files $record, string $output = '', int $limitfrom WHEN ctx.contextlevel = 70 THEN cm.course END WHERE $whereclause - ORDER BY f.component, f.filearea, f.contextid, f.itemid, f.id + ORDER BY f.id "; - $fileset = $DB->get_recordset_sql($sql, $params, $limitfrom, $limitnum); + $fileset = $DB->get_recordset_sql($sql, $params); foreach ($fileset as $filerecord) { $matchcount += self::search_file($filerecord, $criteria, $stream); $filecount ++; $time = time(); @@ -588,44 +590,79 @@ public static function make_where_clause(object $criteria): array { } /** - * Create a where clause to facilitate resumption after a crash. - * eg AND f.component ||'-'|| f.filearea ||'-'|| > - * @param string $lastline of the previous output file. + * Determine the id of a line from the output csv file. + * + * @param string $line of an output file. + * @return int the file id found in the line, or 0 when the line is not a usable data row.
*/ - public static function make_resume_mark(string $lastline) { + public static function get_id_from_csv(string $line): int { // Interpret the last line as a csv line. - $csv = str_getcsv($lastline); + $csv = str_getcsv($line); + // Check a few columns to ensure we have a valid line if (empty($csv[self::CSV_COMPONENT])) { - return ''; + return 0; } if (empty($csv[self::CSV_FILEAREA])) { - return ''; + return 0; } if (empty($csv[self::CSV_CONTEXTID])) { - return ''; - } - if (empty($csv[self::CSV_ITEMID])) { - return ''; + return 0; } if (empty($csv[self::CSV_FILEID])) { - return ''; - } - if ($csv[self::CSV_COMPONENT] == 'component') { - // It looks like we got the header line. - // That is, there was no data in the csv file. - return ''; + return 0; } - $resumemark = - $csv[self::CSV_COMPONENT] . '-' . - $csv[self::CSV_FILEAREA] . '-' . - $csv[self::CSV_CONTEXTID] . '-' . - $csv[self::CSV_ITEMID] . '-' . - $csv[self::CSV_FILEID]; - - return $resumemark; - + // Cast explicitly: a non-numeric value (e.g. a column title) becomes 0 instead of a TypeError. + return (int) $csv[self::CSV_FILEID]; } + /** + * Look at the previous output file to decide how to resume. + * + * @param string $filename + * @return array [int $resumeid, int $matchcount]: the first id that should be scanned and + * the number of matches left in the file; [0, 0] when resuming is not possible. + */ + public static function resume(string $filename): array { + if (!file_exists($filename)) { + return [0, 0]; + } + + $lines = file($filename, FILE_IGNORE_NEW_LINES); + if ($lines === false || count($lines) < 3) { + // Too small to resume + return [0, 0]; + } + $lastline = end($lines); + $resumeid = self::get_id_from_csv($lastline); + if (empty($resumeid)) { + // The last line is unusable (possibly a partial write); fall back to the line before it. + array_pop($lines); + $lastline = end($lines); + $resumeid = self::get_id_from_csv($lastline); + if (empty($resumeid)) { + // If last two lines are bad, give up. + return [0, 0]; + } + } + // Now remove all lines that match this id.
+ while (true) { + array_pop($lines); + $line = end($lines); + $lineid = self::get_id_from_csv($line); + if ($lineid != $resumeid) { + // Leave this line in place. + break; + } + if (count($lines) < 3) { + // Too small to resume + return [0, 0]; + } + } + // Re-write the file, without the matched id lines. + file_put_contents($filename, implode("\n", $lines) . "\n"); + $matchcount = count($lines) - 1; + return [$resumeid, $matchcount]; + } } diff --git a/classes/search.php b/classes/search.php index 19a23fd..3269000 100644 --- a/classes/search.php +++ b/classes/search.php @@ -76,19 +76,19 @@ protected function after_delete($result): void { /** * Queues a search task to be run - * @param int $limitfrom where to start the search sql - * @param int $limitnum limit the number of sql results + * @param int $startid minimum id to be included + * @param int $endid maximum id to be included * @return bool true if the task was queued */ - public function queue_task(int $limitfrom = 0, int $limitnum = 0): bool { + public function queue_task(int $startid = 0, int $endid = 0): bool { $adhoctask = new $this->adhoctask; $customdata = [ 'searchid' => $this->get('id'), ]; // If we have either limits we should include both. - if (!empty($limitfrom) || !empty($limitnum)) { - $customdata['limitfrom'] = $limitfrom; - $customdata['limitnum'] = $limitnum; + if (!empty($startid) || !empty($endid)) { + $customdata['startid'] = $startid; + $customdata['endid'] = $endid; } $adhoctask->set_custom_data($customdata); return \core\task\manager::queue_adhoc_task($adhoctask); diff --git a/classes/task/files.php b/classes/task/files.php index 4e61b47..a5e3c17 100644 --- a/classes/task/files.php +++ b/classes/task/files.php @@ -48,7 +48,7 @@ public function execute() { return self::spawn_shards($record); } - \tool_advancedreplace\file_search::files($record, '', $data->limitfrom ?? 0, $data->limitnum ?? 
0); + \tool_advancedreplace\file_search::files($record, '', $data->startid ?? 0, $data->endid ?? 0); } /** @@ -63,16 +63,17 @@ public static function spawn_shards(\tool_advancedreplace\files $record): void { $numshards = $record->get('shards'); $timestart = time(); - // Get the total number of files. - $criteria = \tool_advancedreplace\file_search::get_criteria($record); - [$whereclause, $params] = \tool_advancedreplace\file_search::make_where_clause($criteria); - $totalfiles = $DB->count_records_select('files', $whereclause, $params); - + // The sharding will be controlled by ranges of the id column of the mdl_files table. + // Here we implement a simple division. (In future we could use WHERE clause to choose more even break points.) + + $maxid = $DB->get_field_sql('SELECT MAX(id) FROM {files}'); + // Create and spawn new tasks. $search = new \tool_advancedreplace\files($searchid); $basedata = $search->copy_data(true); - $limitfrom = 0; - $limitnum = ceil($totalfiles / $numshards); + $shardsize = ceil($maxid / $numshards); + $startid = 0; + $endid = $startid + $shardsize -1; $shardnum = 0; while ($shardnum < $numshards) { $shardnum++; @@ -82,11 +83,12 @@ public static function spawn_shards(\tool_advancedreplace\files $record): void { // Remove the limit on the last shard. if ($shardnum === $numshards) { - $limitnum = 0; + $endid = $maxid; } - $shard->queue_task($limitfrom, $limitnum); - $limitfrom += $limitnum; + $shard->queue_task($startid, $endid); + $startid += $shardsize; + $endid += $shardsize; } // Update the start time on the parent.