modules/fallback/Sort/Naturally.pm

   1
   2 require 5;
   3 package Sort::Naturally;  # Time-stamp: "2004-12-29 18:30:03 AST"
   4 $VERSION = '1.02';
   5 @EXPORT = ('nsort', 'ncmp');
   6 require Exporter;
   7 @ISA = ('Exporter');
   8
   9 use strict;
  10 use locale;
  11 use integer;
  12
  13 #-----------------------------------------------------------------------------
  14 # constants:
  15 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
  16
  17 use Config ();
  18 BEGIN {
  19   # Make a constant such that if a whole-number string is that long
  20   #  or shorter, we KNOW it's treatable as an integer
  21   no integer;
  22   my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
  23   die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4;
  24   eval 'sub MAX_INT_SIZE () {' . $x . '}';
  25   die $@ if $@;
  26   print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG;
  27 }
  28
  29 sub X_FIRST () {-1}
  30 sub Y_FIRST () { 1}
  31
  32 my @ORD = ('same', 'swap', 'asis');
  33
  34 #-----------------------------------------------------------------------------
  35 # For lack of a preprocessor:
  36
  37 my($code, $guts);
  38 $guts = <<'EOGUTS';  # This is the guts of both ncmp and nsort:
  39
  40     if($x eq $y) {
  41       # trap this expensive case first, and then fall thru to tiebreaker
  42       $rv = 0;
  43
  44     # Convoluted hack to get numerics to sort first, at string start:
  45     } elsif($x =~ m/^\d/s) {
  46       if($y =~ m/^\d/s) {
  47         $rv = 0;    # fall thru to normal comparison for the two numbers
  48       } else {
  49         $rv = X_FIRST;
  50         DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
  51       }
  52     } elsif($y =~ m/^\d/s) {
  53       $rv = Y_FIRST;
  54       DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
  55     } else {
  56       $rv = 0;
  57     }
  58
  59     unless($rv) {
  60       # Normal case:
  61       $rv = 0;
  62       DEBUG and print "<$x> and <$y> compared...\n";
  63
  64      Consideration:
  65       while(length $x and length $y) {
  66
  67         DEBUG > 2 and print " <$x> and <$y>...\n";
  68
  69         # First, non-numeric comparison:
  70         $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
  71         $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
  72         # Now make x2 the min length of the two:
  73         $x2 = $y2 if $x2 > $y2;
  74         if($x2) {
  75           DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
  76             substr($x,0,$x2), substr($y,0,$x2);
  77           do {
  78            my $i = substr($x,0,$x2);
  79            my $j = substr($y,0,$x2);
  80            my $sv = $i cmp $j;
  81            print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
  82            last;
  83           }
  84
  85
  86            if $rv =
  87            # The ''. things here force a copy that seems to work around a
  88            #  mysterious intermittent bug that 'use locale' provokes in
  89            #  many versions of Perl.
  90                    $cmp
  91                    ? $cmp->(substr($x,0,$x2) . '',
  92                             substr($y,0,$x2) . '',
  93                            )
  94                    :
  95                    scalar(( substr($x,0,$x2) . '' ) cmp
  96                           ( substr($y,0,$x2) . '' )
  97                           )
  98           ;
  99           # otherwise trim and keep going:
 100           substr($x,0,$x2) = '';
 101           substr($y,0,$x2) = '';
 102         }
 103
 104         # Now numeric:
 105         #  (actually just using $x2 and $y2 as scratch)
 106
 107         if( $x =~ s/^(\d+)//s ) {
 108           $x2 = $1;
 109           if( $y =~ s/^(\d+)//s ) {
 110             # We have two numbers here.
 111             DEBUG > 1 and print " <$x2> and <$1> numerically\n";
 112             if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
 113               # small numbers: we can compare happily
 114               last if $rv = $x2 <=> $1;
 115             } else {
 116               # ARBITRARILY large integers!
 117
 118               # This saves on loss of precision that could happen
 119               #  with actual stringification.
 120               # Also, I sense that very large numbers aren't too
 121               #  terribly common in sort data.
 122
 123               # trim leading 0's:
 124               ($y2 = $1) =~ s/^0+//s;
 125               $x2 =~ s/^0+//s;
 126               print "   Treating $x2 and $y2 as bigint\n" if DEBUG;
 127
 128               no locale; # we want the dumb cmp back.
 129               last if $rv = (
 130                  # works only for non-negative whole numbers:
 131                  length($x2) <=> length($y2)
 132                    # the longer the numeral, the larger the value
 133                  or $x2 cmp $y2
 134                    # between equals, compare lexically!!  amazing but true.
 135               );
 136             }
 137           } else {
 138             # X is numeric but Y isn't
 139             $rv = Y_FIRST;
 140             last;
 141           }
 142         } elsif( $y =~ s/^\d+//s ) {  # we don't need to capture the substring
 143           $rv = X_FIRST;
 144           last;
 145         }
 146          # else one of them is 0-length.
 147
 148        # end-while
 149       }
 150     }
 151 EOGUTS
 152
 153 sub maker {
 154   my $code = $_[0];
 155   $code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~";
 156   eval $code;
 157   die $@ if $@;
 158 }
 159
 160 ##############################################################################
 161
 162 maker(<<'EONSORT');
 163 sub nsort {
 164   # get options:
 165   my($cmp, $lc);
 166   ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
 167
 168   return @_ unless @_ > 1 or wantarray; # be clever
 169
 170   my($x, $x2, $y, $y2, $rv);  # scratch vars
 171
 172   # We use a Schwartzian xform to memoize the lc'ing and \W-removal
 173
 174   map $_->[0],
 175   sort {
 176     if($a->[0] eq $b->[0]) { 0 }   # trap this expensive case
 177     else {
 178
 179     $x = $a->[1];
 180     $y = $b->[1];
 181
 182 ~COMPARATOR~
 183
 184     # Tiebreakers...
 185     DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
 186     $rv ||= (length($x) <=> length($y))  # shorter is always first
 187         ||  ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0]))
 188         ||  ($x      cmp $y     )
 189         ||  ($a->[0] cmp $b->[0])
 190     ;
 191
 192     DEBUG > 1 and print "  <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
 193     $rv;
 194   }}
 195
 196   map {;
 197     $x = $lc ? $lc->($_) : lc($_); # x as scratch
 198     $x =~ s/\W+//s;
 199     [$_, $x];
 200   }
 201   @_
 202 }
 203 EONSORT
 204
 205 #-----------------------------------------------------------------------------
 206 maker(<<'EONCMP');
 207 sub ncmp {
 208   # The guts are basically the same as above...
 209
 210   # get options:
 211   my($cmp, $lc);
 212   ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
 213
 214   if(@_ == 0) {
 215     @_ = ($a, $b); # bit of a hack!
 216     DEBUG > 1 and print "Hacking in <$a><$b>\n";
 217   } elsif(@_ != 2) {
 218     require Carp;
 219     Carp::croak("Not enough options to ncmp!");
 220   }
 221   my($a,$b) = @_;
 222   my($x, $x2, $y, $y2, $rv);  # scratch vars
 223
 224   DEBUG > 1 and print "ncmp args <$a><$b>\n";
 225   if($a eq $b) { # trap this expensive case
 226     0;
 227   } else {
 228     $x = ($lc ? $lc->($a) : lc($a));
 229     $x =~ s/\W+//s;
 230     $y = ($lc ? $lc->($b) : lc($b));
 231     $y =~ s/\W+//s;
 232
 233 ~COMPARATOR~
 234
 235
 236     # Tiebreakers...
 237     DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
 238     $rv ||= (length($x) <=> length($y))  # shorter is always first
 239         ||  ($cmp and $cmp->($x,$y) || $cmp->($a,$b))
 240         ||  ($x cmp $y)
 241         ||  ($a cmp $b)
 242     ;
 243
 244     DEBUG > 1 and print "  <$a> cmp <$b> is $rv\n";
 245     $rv;
 246   }
 247 }
 248 EONCMP
 249
 250 # clean up:
 251 undef $guts;
 252 undef &maker;
 253
 254 #-----------------------------------------------------------------------------
 255 1;
 256
 257 ############### END OF MAIN SOURCE ###########################################
 258 __END__
 259
 260 =head1 NAME
 261
 262 Sort::Naturally -- sort lexically, but sort numeral parts numerically
 263
 264 =head1 SYNOPSIS
 265
 266   @them = nsort(qw(
 267    foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
 268   ));
 269   print join(' ', @them), "\n";
 270
 271 Prints:
 272
 273   9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
 274
 275 (Or "foo12a" + "Foo12a" and "foolio" + "Foolio" and might be
 276 switched, depending on your locale.)
 277
 278 =head1 DESCRIPTION
 279
 280 This module exports two functions, C<nsort> and C<ncmp>; they are used
 281 in implementing my idea of a "natural sorting" algorithm.  Under natural
 282 sorting, numeric substrings are compared numerically, and other
 283 word-characters are compared lexically.
 284
 285 This is the way I define natural sorting:
 286
 287 =over
 288
 289 =item *
 290
 291 Non-numeric word-character substrings are sorted lexically,
 292 case-insensitively: "Foo" comes between "fish" and "fowl".
 293
 294 =item *
 295
 296 Numeric substrings are sorted numerically:
 297 "100" comes after "20", not before.
 298
 299 =item *
 300
 301 \W substrings (neither words-characters nor digits) are I<ignored>.
 302
 303 =item *
 304
 305 Our use of \w, \d, \D, and \W is locale-sensitive:  Sort::Naturally
 306 uses a C<use locale> statement.
 307
 308 =item *
 309
 310 When comparing two strings, where a numeric substring in one
 311 place is I<not> up against a numeric substring in another,
 312 the non-numeric always comes first.  This is fudged by
 313 reading pretending that the lack of a number substring has
 314 the value -1, like so:
 315
 316   foo       =>  "foo",  -1
 317   foobar    =>  "foo",  -1,  "bar"
 318   foo13     =>  "foo",  13,
 319   foo13xyz  =>  "foo",  13,  "xyz"
 320
 321 That's so that "foo" will come before "foo13", which will come
 322 before "foobar".
 323
 324 =item *
 325
 326 The start of a string is exceptional: leading non-\W (non-word,
 327 non-digit)
 328 components are are ignored, and numbers come I<before> letters.
 329
 330 =item *
 331
 332 I define "numeric substring" just as sequences matching m/\d+/ --
 333 scientific notation, commas, decimals, etc., are not seen.  If
 334 your data has thousands separators in numbers
 335 ("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
 336 consider stripping them before feeding them to C<nsort> or
 337 C<ncmp>.
 338
 339 =back
 340
 341 =head2 The nsort function
 342
 343 This function takes a list of strings, and returns a copy of the list,
 344 sorted.
 345
 346 This is what most people will want to use:
 347
 348   @stuff = nsort(...list...);
 349
 350 When nsort needs to compare non-numeric substrings, it
 351 uses Perl's C<lc> function in scope of a <use locale>.
 352 And when nsort needs to lowercase things, it uses Perl's
 353 C<lc> function in scope of a <use locale>.  If you want nsort
 354 to use other functions instead, you can specify them in
 355 an arrayref as the first argument to nsort:
 356
 357   @stuff = nsort( [
 358                     \&string_comparator,   # optional
 359                     \&lowercaser_function  # optional
 360                   ],
 361                   ...list...
 362                 );
 363
 364 If you want to specify a string comparator but no lowercaser,
 365 then the options list is C<[\&comparator, '']> or
 366 C<[\&comparator]>.  If you want to specify no string comparator
 367 but a lowercaser, then the options list is
 368 C<['', \&lowercaser]>.
 369
 370 Any comparator you specify is called as
 371 C<$comparator-E<gt>($left, $right)>,
 372 and, like a normal Perl C<cmp> replacement, must return
 373 -1, 0, or 1 depending on whether the left argument is stringwise
 374 less than, equal to, or greater than the right argument.
 375
 376 Any lowercaser function you specify is called as
 377 C<$lowercased = $lowercaser-E<gt>($original)>.  The routine
 378 must not modify its C<$_[0]>.
 379
 380 =head2 The ncmp function
 381
 382 Often, when sorting non-string values like this:
 383
 384    @objects_sorted = sort { $a->tag cmp $b->tag } @objects;
 385
 386 ...or even in a Schwartzian transform, like this:
 387
 388    @strings =
 389      map $_->[0]
 390      sort { $a->[1] cmp $b->[1] }
 391      map { [$_, make_a_sort_key_from($_) ]
 392      @_
 393    ;
 394
 395 ...you wight want something that replaces not C<sort>, but C<cmp>.
 396 That's what Sort::Naturally's C<ncmp> function is for.  Call it with
 397 the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
 398 but otherwise it's a fine replacement:
 399
 400    @objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
 401
 402    @strings =
 403      map $_->[0]
 404      sort { ncmp($a->[1], $b->[1]) }
 405      map { [$_, make_a_sort_key_from($_) ]
 406      @_
 407    ;
 408
 409 Just as with C<nsort> can take different a string-comparator
 410 and/or lowercaser, you can do the same with C<ncmp>, by passing
 411 an arrayref as the first argument:
 412
 413   ncmp( [
 414           \&string_comparator,   # optional
 415           \&lowercaser_function  # optional
 416         ],
 417         $left, $right
 418       )
 419
 420 You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>.
 421
 422 =head1 NOTES
 423
 424 =over
 425
 426 =item *
 427
 428 This module is not a substitute for
 429 L<Sort::Versions|Sort::Versions>!  If
 430 you just need proper version sorting, use I<that!>
 431
 432 =item *
 433
 434 If you need something that works I<sort of> like this module's
 435 functions, but not quite the same, consider scouting thru this
 436 module's source code, and adapting what you see.  Besides
 437 the functions that actually compile in this module, after the POD,
 438 there's several alternate attempts of mine at natural sorting
 439 routines, which are not compiled as part of the module, but which you
 440 might find useful.  They should all be I<working> implementations of
 441 slightly different algorithms
 442 (all of them based on Martin Pool's C<nsort>) which I eventually
 443 discarded in favor of my algorithm.  If you are having to
 444 naturally-sort I<very large> data sets, and sorting is getting
 445 ridiculously slow, you might consider trying one of those
 446 discarded functions -- I have a feeling they might be faster on
 447 large data sets.  Benchmark them on your data and see.  (Unless
 448 you I<need> the speed, don't bother.  Hint: substitute C<sort>
 449 for C<nsort> in your code, and unless your program speeds up
 450 drastically, it's not the sorting that's slowing things down.
 451 But if it I<is> C<nsort> that's slowing things down, consider
 452 just:
 453
 454       if(@set >= SOME_VERY_BIG_NUMBER) {
 455         no locale; # vroom vroom
 456         @sorted = sort(@set);  # feh, good enough
 457       } elsif(@set >= SOME_BIG_NUMBER) {
 458         use locale;
 459         @sorted = sort(@set);  # feh, good enough
 460       } else {
 461         # but keep it pretty for normal cases
 462         @sorted = nsort(@set);
 463       }
 464
 465 =item *
 466
 467 If you do adapt the routines in this module, email me; I'd
 468 just be interested in hearing about it.
 469
 470 =item *
 471
 472 Thanks to the EFNet #perl people for encouraging this module,
 473 especially magister and a-mused.
 474
 475 =back
 476
 477 =head1 COPYRIGHT AND DISCLAIMER
 478
 479 Copyright 2001, Sean M. Burke C<sburke@cpan.org>, all rights
 480 reserved.  This program is free software; you can redistribute it
 481 and/or modify it under the same terms as Perl itself.
 482
 483 This program is distributed in the hope that it will be useful, but
 484 without any warranty; without even the implied warranty of
 485 merchantability or fitness for a particular purpose.
 486
 487 =head1 AUTHOR
 488
 489 Sean M. Burke C<sburke@cpan.org>
 490
 491 =cut
 492
 493 ############   END OF DOCS   ############
 494
 495 ############################################################################
 496 ############################################################################
 497
 498 ############ BEGIN OLD STUFF ############
 499
 500 # We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
 501
 502 #-----------------------------------------------------------------------------
 503 sub nsort {
 504   my($cmp, $lc);
 505   return @_ if @_ < 2;   # Just to be CLEVER.
 506
 507   my($x, $i);  # scratch vars
 508
 509   # And now, the GREAT BIG Schwartzian transform:
 510
 511   map
 512     $_->[0],
 513
 514   sort {
 515     # Uses $i as the index variable, $x as the result.
 516     $x = 0;
 517     $i = 1;
 518     DEBUG and print "\nComparing ", map("{$_}", @$a),
 519                  ' : ', map("{$_}", @$b), , "...\n";
 520
 521     while($i < @$a and $i < @$b) {
 522       DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
 523         $a->[$i] cmp $b->[$i], "\n";
 524       last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
 525       ++$i;
 526
 527       DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
 528         $a->[$i] <=> $b->[$i], "\n";
 529       last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
 530       ++$i;
 531     }
 532
 533     DEBUG and print "{$a->[0]} : {$b->[0]} is ",
 534       $x || (@$a <=> @$b) || 0
 535       ,"\n"
 536     ;
 537     $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
 538       # unless we found a result for $x in the while loop,
 539       #  use length as a tiebreaker, otherwise use cmp
 540       #  on the original string as a fallback tiebreaker.
 541   }
 542
 543   map {
 544     my @bit = ($x = defined($_) ? $_ : '');
 545
 546     if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
 547       # It's entirely purely numeric, so treat it specially:
 548       push @bit, '', $x;
 549     } else {
 550       # Consume the string.
 551       while(length $x) {
 552         push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
 553         push @bit, ($x =~ s/^(\d+)//s) ?    $1  :  0;
 554       }
 555     }
 556     DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
 557
 558     # End result: [original bit         , (text, number), (text, number), ...]
 559     # Minimally:  [0-length original bit,]
 560     # Examples:
 561     #    ['10'         => ''   ,  10,              ]
 562     #    ['fo900'      => 'fo' , 900,              ]
 563     #    ['foo10'      => 'foo',  10,              ]
 564     #    ['foo9.pl'    => 'foo',   9,   , '.pl', 0 ]
 565     #    ['foo32.pl'   => 'foo',  32,   , '.pl', 0 ]
 566     #    ['foo325.pl'  => 'foo', 325,   , '.pl', 0 ]
 567     #  Yes, always an ODD number of elements.
 568
 569     \@bit;
 570   }
 571   @_;
 572 }
 573
 574 #-----------------------------------------------------------------------------
 575 # Same as before, except without the pure-number trap.
 576
 577 sub nsorts {
 578   return @_ if @_ < 2;   # Just to be CLEVER.
 579
 580   my($x, $i);  # scratch vars
 581
 582   # And now, the GREAT BIG Schwartzian transform:
 583
 584   map
 585     $_->[0],
 586
 587   sort {
 588     # Uses $i as the index variable, $x as the result.
 589     $x = 0;
 590     $i = 1;
 591     DEBUG and print "\nComparing ", map("{$_}", @$a),
 592                  ' : ', map("{$_}", @$b), , "...\n";
 593
 594     while($i < @$a and $i < @$b) {
 595       DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
 596         $a->[$i] cmp $b->[$i], "\n";
 597       last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
 598       ++$i;
 599
 600       DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
 601         $a->[$i] <=> $b->[$i], "\n";
 602       last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
 603       ++$i;
 604     }
 605
 606     DEBUG and print "{$a->[0]} : {$b->[0]} is ",
 607       $x || (@$a <=> @$b) || 0
 608       ,"\n"
 609     ;
 610     $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
 611       # unless we found a result for $x in the while loop,
 612       #  use length as a tiebreaker, otherwise use cmp
 613       #  on the original string as a fallback tiebreaker.
 614   }
 615
 616   map {
 617     my @bit = ($x = defined($_) ? $_ : '');
 618
 619     while(length $x) {
 620       push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
 621       push @bit, ($x =~ s/^(\d+)//s) ?    $1  :  0;
 622     }
 623     DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
 624
 625     # End result: [original bit         , (text, number), (text, number), ...]
 626     # Minimally:  [0-length original bit,]
 627     # Examples:
 628     #    ['10'         => ''   ,  10,              ]
 629     #    ['fo900'      => 'fo' , 900,              ]
 630     #    ['foo10'      => 'foo',  10,              ]
 631     #    ['foo9.pl'    => 'foo',   9,   , '.pl', 0 ]
 632     #    ['foo32.pl'   => 'foo',  32,   , '.pl', 0 ]
 633     #    ['foo325.pl'  => 'foo', 325,   , '.pl', 0 ]
 634     #  Yes, always an ODD number of elements.
 635
 636     \@bit;
 637   }
 638   @_;
 639 }
 640
 641 #-----------------------------------------------------------------------------
 642 # Same as before, except for the sort-key-making
 643
 644 sub nsort0 {
 645   return @_ if @_ < 2;   # Just to be CLEVER.
 646
 647   my($x, $i);  # scratch vars
 648
 649   # And now, the GREAT BIG Schwartzian transform:
 650
 651   map
 652     $_->[0],
 653
 654   sort {
 655     # Uses $i as the index variable, $x as the result.
 656     $x = 0;
 657     $i = 1;
 658     DEBUG and print "\nComparing ", map("{$_}", @$a),
 659                  ' : ', map("{$_}", @$b), , "...\n";
 660
 661     while($i < @$a and $i < @$b) {
 662       DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
 663         $a->[$i] cmp $b->[$i], "\n";
 664       last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
 665       ++$i;
 666
 667       DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
 668         $a->[$i] <=> $b->[$i], "\n";
 669       last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
 670       ++$i;
 671     }
 672
 673     DEBUG and print "{$a->[0]} : {$b->[0]} is ",
 674       $x || (@$a <=> @$b) || 0
 675       ,"\n"
 676     ;
 677     $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
 678       # unless we found a result for $x in the while loop,
 679       #  use length as a tiebreaker, otherwise use cmp
 680       #  on the original string as a fallback tiebreaker.
 681   }
 682
 683   map {
 684     my @bit = ($x = defined($_) ? $_ : '');
 685
 686     if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
 687       # It's entirely purely numeric, so treat it specially:
 688       push @bit, '', $x;
 689     } else {
 690       # Consume the string.
 691       while(length $x) {
 692         push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
 693         # Secret sauce:
 694         if($x =~ s/^(\d+)//s) {
 695           if(substr($1,0,1) eq '0' and $1 != 0) {
 696             push @bit, $1 / (10 ** length($1));
 697           } else {
 698             push @bit, $1;
 699           }
 700         } else {
 701           push @bit, 0;
 702         }
 703       }
 704     }
 705     DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
 706
 707     \@bit;
 708   }
 709   @_;
 710 }
 711
 712 #-----------------------------------------------------------------------------
 713 # Like nsort0, but WITHOUT pure number handling, and WITH special treatment
 714 # of pulling off extensions and version numbers.
 715
 716 sub nsortf {
 717   return @_ if @_ < 2;   # Just to be CLEVER.
 718
 719   my($x, $i);  # scratch vars
 720
 721   # And now, the GREAT BIG Schwartzian transform:
 722
 723   map
 724     $_->[0],
 725
 726   sort {
 727     # Uses $i as the index variable, $x as the result.
 728     $x = 0;
 729     $i = 3;
 730     DEBUG and print "\nComparing ", map("{$_}", @$a),
 731                  ' : ', map("{$_}", @$b), , "...\n";
 732
 733     while($i < @$a and $i < @$b) {
 734       DEBUG and print "  comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
 735         $a->[$i] cmp $b->[$i], "\n";
 736       last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
 737       ++$i;
 738
 739       DEBUG and print "  comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
 740         $a->[$i] <=> $b->[$i], "\n";
 741       last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
 742       ++$i;
 743     }
 744
 745     DEBUG and print "{$a->[0]} : {$b->[0]} is ",
 746       $x || (@$a <=> @$b) || 0
 747       ,"\n"
 748     ;
 749     $x || (@$a     <=> @$b    ) || ($a->[1] cmp $b->[1])
 750        || ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]);
 751       # unless we found a result for $x in the while loop,
 752       #  use length as a tiebreaker, otherwise use the
 753       #  lc'd extension, otherwise the verison, otherwise use
 754       #  the original string as a fallback tiebreaker.
 755   }
 756
 757   map {
 758     my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );
 759
 760     {
 761       # Consume the string.
 762
 763       # First, pull off any VAX-style version
 764       $bit[2] = $1 if $x =~ s/;(\d+)$//;
 765
 766       # Then pull off any apparent extension
 767       if( $x !~ m/^\.+$/s and     # don't mangle ".", "..", or "..."
 768           $x =~ s/(\.[^\.\;]*)$//sg
 769           # We could try to avoid catching all-digit extensions,
 770           #  but I think that's getting /too/ clever.
 771       ) {
 772         $i = $1;
 773         if($x =~ m<[^\\\://]$>s) {
 774           # We didn't take the whole basename.
 775           $bit[1] = lc $i;
 776           DEBUG and print "Consuming extension \"$1\"\n";
 777         } else {
 778           # We DID take the whole basename.  Fix it.
 779           $x = $1;  # Repair it.
 780         }
 781       }
 782
 783       push @bit, '', -1   if $x =~ m/^\./s;
 784        # A hack to make .-initial filenames sort first, regardless of locale.
 785        # And -1 is always a sort-firster, since in the code below, there's
 786        # no allowance for filenames containing negative numbers: -1.dat
 787        # will be read as string '-' followed by number 1.
 788
 789       while(length $x) {
 790         push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
 791         # Secret sauce:
 792         if($x =~ s/^(\d+)//s) {
 793           if(substr($1,0,1) eq '0' and $1 != 0) {
 794             push @bit, $1 / (10 ** length($1));
 795           } else {
 796             push @bit, $1;
 797           }
 798         } else {
 799           push @bit, 0;
 800         }
 801       }
 802     }
 803
 804     DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
 805
 806     \@bit;
 807   }
 808   @_;
 809 }
 810
 811 # yowza yowza yowza.
 812