WIP

catalyst · Oct 29, 2024 · 5d884a4 · 5d884a4
1 parent dc4ed22
commit 5d884a4
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 60 deletions.
diff --git a/classes/file_search.php b/classes/file_search.php
@@ -148,11 +148,11 @@ public static function get_criteria(files $record): object {
      *
      * @param \tool_advancedreplace\files $record
      * @param string $output path
-     * @param int $limitfrom limit for sql
-     * @param int $limitnum limit for sql
+     * @param int $startid minimum id for sql
+     * @param int $endid maximum id for sql
      * @return void
      */
-    public static function files(files $record, string $output = '', int $limitfrom = 0, int $limitnum = 0) {
+    public static function files(files $record, string $output = '', int $startid = 0, int $endid = 0) {
         global $DB;
         \core_php_time_limit::raise();
         raise_memory_limit(MEMORY_HUGE);
@@ -169,22 +169,26 @@ public static function files(files $record, string $output = '', int $limitfrom
         }
 
         [$whereclause, $params] = self::make_where_clause($criteria);
+        // If we are running a shard, then restrict the range of id.
+        if ( ! empty($startid) || ! empty($endid)) {
+            $whereclause .= ' AND f.id between :startid and :endid';
+            $params['startid'] = $startid;
+            $params['endid'] = $endid;
+        }
 
         // If the output file already exists, try to resume.
         if (file_exists($output)) {
             // This must be a resumed job. We need to append to previous output.
-            [$lastline, $linecount] = helper::read_last_line($output);
-            $resumemark = self::make_resume_mark($lastline);
-            $matchcount = ($linecount > 0) ? $linecount - 1 : 0;
+            [$resumeid, $matchcount] = self::resume($output);
         } else {
-            $resumemark = '';
-            $matchcount = 0;
+            $resumeid = 0 ;
+            $matchcount = 0 ;
         }
-        if ($resumemark != '') {
+        print "Operating with resumeid=$resumid and matchcount is $matchcount\n";
+        if ( ! empty($resumeid)) {
             $stream = fopen($output, 'a');
-            $whereclause .=
-                " AND f.component || '-' || f.filearea || '-' || f.contextid || '-' || f.itemid || '-' || f.id >= :resumemark ";
-            $params['resumemark'] = $resumemark;
+            $whereclause .= ' AND f.id >= :resumeid ';
+            $params['resumeid'] = $resumeid;
         } else {
             $stream = fopen($output, 'w');
             $columnheaders = [
@@ -194,15 +198,13 @@ public static function files(files $record, string $output = '', int $limitfrom
             fputcsv($stream, $columnheaders);
         }
 
+
         $record->set('timestart', time());
         $updatetime = time();
         $updatepercent = 0;
         $filecount = 0;
         $total = $DB->get_record_sql("SELECT COUNT('x') total FROM {files} f WHERE " . $whereclause, $params);
         $totalfiles = $total->total;
-        if (!empty($limitnum)) {
-            $totalfiles = min($totalfiles, $limitnum);
-        }
         $sql = "
             SELECT
                 f.id, f.component, f.filearea, f.contextid, f.itemid, f.filename, f.filepath, f.mimetype,
@@ -215,10 +217,11 @@ public static function files(files $record, string $output = '', int $limitfrom
                                                WHEN ctx.contextlevel = 70 THEN cm.course
                                            END
             WHERE $whereclause
-            ORDER BY f.component, f.filearea, f.contextid, f.itemid, f.id
+            ORDER BY f.id
         ";
-        $fileset = $DB->get_recordset_sql($sql, $params, $limitfrom, $limitnum);
+        $fileset = $DB->get_recordset_sql($sql, $params);
         foreach ($fileset as $filerecord) {
+            sleep (5); //-------// remove before flight   
             $matchcount += self::search_file($filerecord, $criteria, $stream);
             $filecount ++;
             $time = time();
@@ -588,44 +591,86 @@ public static function make_where_clause(object $criteria): array {
     }
 
     /**
-     * Create a where clause to facilitate resumption after a crash.
-     * eg   AND f.component ||'-'|| f.filearea ||'-'|| >
-     * @param string $lastline of the previous output file.
+     * Extract the file id from one line of the output csv file.
+     *
+     * A line is only trusted when its component, filearea and contextid columns
+     * are non-empty and its file id column consists solely of digits. The header
+     * line, blank lines and lines cut short by a crash therefore all yield 0.
+     *
+     * @param string $line one line of an output file.
+     * @return int the file id, or 0 when the line is not a usable data line.
      */
-    public static function make_resume_mark(string $lastline) {
+    public static function get_id_from_csv(string $line): int {
         // Interpret the last line as a csv line.
-        $csv = str_getcsv($lastline);
+        $csv = str_getcsv($line);
+        // Check a few columns to ensure we have a valid line.
         if (empty($csv[self::CSV_COMPONENT])) {
-            return '';
+            return 0;
         }
         if (empty($csv[self::CSV_FILEAREA])) {
-            return '';
+            return 0;
         }
         if (empty($csv[self::CSV_CONTEXTID])) {
-            return '';
-        }
-        if (empty($csv[self::CSV_ITEMID])) {
-            return '';
+            return 0;
         }
         if (empty($csv[self::CSV_FILEID])) {
-            return '';
-        }
-        if ($csv[self::CSV_COMPONENT] == 'component') {
-            // It looks like we got the header line.
-            // That is, there was no data in the csv file.
-            return '';
+            return 0;
         }
-        $resumemark =
-            $csv[self::CSV_COMPONENT] . '-' .
-            $csv[self::CSV_FILEAREA] . '-' .
-            $csv[self::CSV_CONTEXTID] . '-' .
-            $csv[self::CSV_ITEMID] . '-' .
-            $csv[self::CSV_FILEID];
-
-        return $resumemark;
-
+        // The header line (and any damaged line) carries non-numeric text in this column.
+        // Returning that through the int return type would raise a TypeError.
+        if (!ctype_digit((string) $csv[self::CSV_FILEID])) {
+            return 0;
+        }
+        return (int) $csv[self::CSV_FILEID];
     }
 
+    /**
+     * Look at the previous output file to decide how to resume.
+     *
+     * The id on the last usable line becomes the resume point. Every trailing line
+     * carrying that id is removed from the file, because the crash may have happened
+     * part way through that file; the caller searches it again from scratch.
+     * At most one damaged trailing line (e.g. a partial write) is tolerated.
+     *
+     * @param string $filename path of the previous output csv file.
+     * @return array [int $resumeid, int $matchcount]: the first id that should be scanned
+     *               and the number of matches left in the file. [0, 0] means the file
+     *               cannot be resumed and the search should start over.
+     */
+    public static function resume(string $filename): array {
+        if (!file_exists($filename)) {
+            return [0, 0];
+        }
+
+        // NOTE(review): this reads physical lines, so it assumes no csv field contains
+        // an embedded newline - confirm against what search_file() writes.
+        $lines = file($filename, FILE_IGNORE_NEW_LINES);
+        if ($lines === false || count($lines) < 3) {
+            // Unreadable, or too small to resume.
+            return [0, 0];
+        }
+
+        $resumeid = self::get_id_from_csv(end($lines));
+        if (empty($resumeid)) {
+            // The last line is damaged; drop it and try the one before.
+            array_pop($lines);
+            $resumeid = self::get_id_from_csv(end($lines));
+            if (empty($resumeid)) {
+                // If last two lines are bad, give up.
+                return [0, 0];
+            }
+        }
+
+        // Now remove all trailing lines that match this id. Line 0 is the header and
+        // is never parsed or removed.
+        while (count($lines) > 1 && self::get_id_from_csv(end($lines)) === $resumeid) {
+            array_pop($lines);
+        }
+        if (count($lines) < 2) {
+            // Every data line belonged to the resume id, so nothing is worth keeping.
+            return [0, 0];
+        }
+
+        // Re-write the file, without the matched id lines.
+        if (file_put_contents($filename, implode("\n", $lines) . "\n") === false) {
+            // The stale lines are still in the file, so appending would duplicate them.
+            return [0, 0];
+        }
+        $matchcount = count($lines) - 1;
+        return [$resumeid, $matchcount];
+    }
 }
 
 

diff --git a/classes/search.php b/classes/search.php
@@ -76,19 +76,19 @@ protected function after_delete($result): void {
 
     /**
      * Queues a search task to be run
-     * @param int $limitfrom where to start the search sql
-     * @param int $limitnum limit the number of sql results
+     * @param int $startid minimum id to be included
+     * @param int $endid maximum id to be included
      * @return bool true if the task was queued
      */
-    public function queue_task(int $limitfrom = 0, int $limitnum = 0): bool {
+    public function queue_task(int $startid = 0, int $endid = 0): bool {
         $adhoctask = new $this->adhoctask;
         $customdata = [
             'searchid' => $this->get('id'),
         ];
         // If we have either limits we should include both.
-        if (!empty($limitfrom) || !empty($limitnum)) {
-            $customdata['limitfrom'] = $limitfrom;
-            $customdata['limitnum'] = $limitnum;
+        if (!empty($startid) || !empty($endid)) {
+            // Each bound needs its own key: the task reads 'startid' and 'endid' separately.
+            $customdata['startid'] = $startid;
+            $customdata['endid'] = $endid;
         }
         $adhoctask->set_custom_data($customdata);
         return \core\task\manager::queue_adhoc_task($adhoctask);

diff --git a/classes/task/files.php b/classes/task/files.php
@@ -48,7 +48,7 @@ public function execute() {
             return self::spawn_shards($record);
         }
 
-        \tool_advancedreplace\file_search::files($record, '', $data->limitfrom ?? 0, $data->limitnum ?? 0);
+        \tool_advancedreplace\file_search::files($record, '', $data->startid ?? 0, $data->endid ?? 0);
     }
 
     /**
@@ -63,16 +63,17 @@ public static function spawn_shards(\tool_advancedreplace\files $record): void {
         $numshards = $record->get('shards');
         $timestart = time();
 
-        // Get the total number of files.
-        $criteria = \tool_advancedreplace\file_search::get_criteria($record);
-        [$whereclause, $params] = \tool_advancedreplace\file_search::make_where_clause($criteria);
-        $totalfiles = $DB->count_records_select('files', $whereclause, $params);
-
+        // The sharding will be controlled by ranges of the id column of the mdl_files table. 
+        // Here we implement a simple division. (In future we could use WHERE clause to choose more even break points.)
+
+        $maxid = $DB->get_field_sql('SELECT MAX(id) FROM {files}');
+        
         // Create and spawn new tasks.
         $search = new \tool_advancedreplace\files($searchid);
         $basedata = $search->copy_data(true);
-        $limitfrom = 0;
-        $limitnum = ceil($totalfiles / $numshards);
+        $shardsize = ceil($maxid / $numshards);
+        $startid = 0;
+        $endid = $startid + $shardsize -1;
         $shardnum = 0;
         while ($shardnum < $numshards) {
             $shardnum++;
@@ -82,11 +83,12 @@ public static function spawn_shards(\tool_advancedreplace\files $record): void {
 
             // Remove the limit on the last shard.
             if ($shardnum === $numshards) {
-                $limitnum = 0;
+                $endid = $maxid;
             }
 
-            $shard->queue_task($limitfrom, $limitnum);
-            $limitfrom += $limitnum;
+            $shard->queue_task($startid, $endid);
+            $startid += $shardsize;
+            $endid += $shardsize;
         }
 
         // Update the start time on the parent.