autodoc: Change dictionary sort order

This makes this more in line with Data::Dumper sorting. upper/lower case continues to not matter, and numbers continue to come after letters, so that ckWARN2() comes after plain ckWARN(). It changes non-leading underscores to come before letters, so that ck_warner comes before ckWARN. And it changes so leading underscores come after non-leading, so that aMY_CXT and aMY_CXT_ come before _aMY_CXT.
Perl · Jul 30, 2024 · 32eaf9d · 32eaf9d
1 parent 963e0b1
commit 32eaf9d
Showing 1 changed file with 22 additions and 27 deletions.
diff --git a/autodoc.pl b/autodoc.pl
@@ -2162,45 +2162,40 @@ sub dictionary_order {
     local $a = $a;
     local $b = $b;
 
-    # Convert all digit sequences to same length with leading zeros, so for
-    # example, 8 will compare less than 16 (using a fill length value that
-    # should be longer than any sequence in the input).
+    # Convert all digit sequences to be the same length with leading zeros, so
+    # that, for example '8' will sort before '16' (using a fill length value
+    # that should be longer than any sequence in the input).
     $a =~ s/(\d+)/sprintf "%06d", $1/ge;
     $b =~ s/(\d+)/sprintf "%06d", $1/ge;
 
-    # Translate any underscores and digits so they compare after all Unicode
-    # characters
-    $a =~ tr[_0-9]/\x{110000}-\x{11000A}/;
-    $b =~ tr[_0-9]/\x{110000}-\x{11000A}/;
+    # Translate any underscores so they sort lowest.  This causes 'word1_word2'
+    # to sort before 'word1word2' for all words.
+    # And translate any digits so they come after anything else.  This causes
+    #  digits to sort highest)
+    $a =~ tr[_0-9]/\0\x{110000}-\x{110009}/;
+    $b =~ tr[_0-9]/\0\x{110000}-\x{110009}/;
+
+    # Then move leading underscores to the end, translating them to above
+    # everything else.  This causes '_word_' to compare just after 'word_'
+    $a .= "\x{11000A}" x length $1 if $a =~ s/ ^ (\0+) //x;
+    $b .= "\x{11000A}" x length $1 if $b =~ s/ ^ (\0+) //x;
 
-    use feature 'state';
     # Modify \w, \W to reflect the changes.
-    state $ud = '\x{110000}-\x{11000A}';    # xlated underscore, digits
-    state $w = "\\w$ud";                    # new \w string
+    use feature 'state';
+    state $w = "\\w\0\x{110000}-\x{11000A}";   # new \w string
     state $mod_w = qr/[$w]/;
     state $mod_W = qr/[^$w]/;
 
-    # Only \w for initial comparison
-    my $a_only_word = uc($a =~ s/$mod_W//gr);
-    my $b_only_word = uc($b =~ s/$mod_W//gr);
-
-    # And not initial nor interior underscores nor digits (by squeezing them
-    # out)
-    my $a_stripped = $a_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx;
-    my $b_stripped = $b_only_word =~ s/ (*atomic:[$ud]+) (*pla: $mod_w ) //grxx;
+    # Strip out \W.
+    my $a_stripped = $a =~ s/$mod_W//gr;
+    my $b_stripped = $b =~ s/$mod_W//gr;
 
     # If the stripped versions differ, use that as the comparison.
-    my $cmp = $a_stripped cmp $b_stripped;
-    return $cmp if $cmp;
-
-    # For the first tie breaker, repeat, but consider initial and interior
-    # underscores and digits, again having those compare after all Unicode
-    # characters
-    $cmp = $a_only_word cmp $b_only_word;
+    my $cmp = lc $a_stripped cmp lc $b_stripped;
     return $cmp if $cmp;
 
-    # Next tie breaker is just a caseless comparison
-    $cmp = uc($a) cmp uc($b);
+    # For the first tie breaker use a plain caseless comparison
+    $cmp = lc $a  cmp lc $b;
     return $cmp if $cmp;
 
     # Finally a straight comparison