3 package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST"
5 @EXPORT = ('nsort', 'ncmp');
13 #-----------------------------------------------------------------------------
15 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
19 # Make a constant such that if a whole-number string is that long
20 # or shorter, we KNOW it's treatable as an integer
22 my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1;
23 die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4;
24 eval 'sub MAX_INT_SIZE () {' . $x . '}';
26 print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG;
32 my @ORD = ('same', 'swap', 'asis');
34 #-----------------------------------------------------------------------------
35 # For lack of a preprocessor:
38 $guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort:
41 # trap this expensive case first, and then fall thru to tiebreaker
44 # Convoluted hack to get numerics to sort first, at string start:
45 } elsif($x =~ m/^\d/s) {
47 $rv = 0; # fall thru to normal comparison for the two numbers
50 DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n";
52 } elsif($y =~ m/^\d/s) {
54 DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n";
62 DEBUG and print "<$x> and <$y> compared...\n";
65 while(length $x and length $y) {
67 DEBUG > 2 and print " <$x> and <$y>...\n";
69 # First, non-numeric comparison:
70 $x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0;
71 $y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0;
72 # Now make x2 the min length of the two:
73 $x2 = $y2 if $x2 > $y2;
75 DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n",
76 substr($x,0,$x2), substr($y,0,$x2);
78 my $i = substr($x,0,$x2);
79 my $j = substr($y,0,$x2);
81 print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv;
87 # The ''. things here force a copy that seems to work around a
88 # mysterious intermittent bug that 'use locale' provokes in
89 # many versions of Perl.
91 ? $cmp->(substr($x,0,$x2) . '',
92 substr($y,0,$x2) . '',
95 scalar(( substr($x,0,$x2) . '' ) cmp
96 ( substr($y,0,$x2) . '' )
99 # otherwise trim and keep going:
100 substr($x,0,$x2) = '';
101 substr($y,0,$x2) = '';
105 # (actually just using $x2 and $y2 as scratch)
107 if( $x =~ s/^(\d+)//s ) {
109 if( $y =~ s/^(\d+)//s ) {
110 # We have two numbers here.
111 DEBUG > 1 and print " <$x2> and <$1> numerically\n";
112 if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) {
113 # small numbers: we can compare happily
114 last if $rv = $x2 <=> $1;
116 # ARBITRARILY large integers!
118 # This saves on loss of precision that could happen
119 # with actual stringification.
120 # Also, I sense that very large numbers aren't too
121 # terribly common in sort data.
124 ($y2 = $1) =~ s/^0+//s;
126 print " Treating $x2 and $y2 as bigint\n" if DEBUG;
128 no locale; # we want the dumb cmp back.
130 # works only for non-negative whole numbers:
131 length($x2) <=> length($y2)
132 # the longer the numeral, the larger the value
134 # between equals, compare lexically!! amazing but true.
138 # X is numeric but Y isn't
142 } elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring
146 # else one of them is 0-length.
155 $code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~";
160 ##############################################################################
166 ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
168 return @_ unless @_ > 1 or wantarray; # be clever
170 my($x, $x2, $y, $y2, $rv); # scratch vars
172 # We use a Schwartzian xform to memoize the lc'ing and \W-removal
176 if($a->[0] eq $b->[0]) { 0 } # trap this expensive case
185 DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
186 $rv ||= (length($x) <=> length($y)) # shorter is always first
187 || ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0]))
189 || ($a->[0] cmp $b->[0])
192 DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n";
197 $x = $lc ? $lc->($_) : lc($_); # x as scratch
205 #-----------------------------------------------------------------------------
208 # The guts are basically the same as above...
212 ($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY';
215 @_ = ($a, $b); # bit of a hack!
216 DEBUG > 1 and print "Hacking in <$a><$b>\n";
219 Carp::croak("Not enough options to ncmp!");
222 my($x, $x2, $y, $y2, $rv); # scratch vars
224 DEBUG > 1 and print "ncmp args <$a><$b>\n";
225 if($a eq $b) { # trap this expensive case
228 $x = ($lc ? $lc->($a) : lc($a));
230 $y = ($lc ? $lc->($b) : lc($b));
237 DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n";
238 $rv ||= (length($x) <=> length($y)) # shorter is always first
239 || ($cmp and $cmp->($x,$y) || $cmp->($a,$b))
244 DEBUG > 1 and print " <$a> cmp <$b> is $rv\n";
254 #-----------------------------------------------------------------------------
257 ############### END OF MAIN SOURCE ###########################################
262 Sort::Naturally -- sort lexically, but sort numeral parts numerically
267 foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a
269 print join(' ', @them), "\n";
273 9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a
275 (Or "foo12a" + "Foo12a" and "foolio" + "Foolio" and might be
276 switched, depending on your locale.)
280 This module exports two functions, C<nsort> and C<ncmp>; they are used
281 in implementing my idea of a "natural sorting" algorithm. Under natural
282 sorting, numeric substrings are compared numerically, and other
283 word-characters are compared lexically.
285 This is the way I define natural sorting:
291 Non-numeric word-character substrings are sorted lexically,
292 case-insensitively: "Foo" comes between "fish" and "fowl".
296 Numeric substrings are sorted numerically:
297 "100" comes after "20", not before.
301 \W substrings (neither words-characters nor digits) are I<ignored>.
305 Our use of \w, \d, \D, and \W is locale-sensitive: Sort::Naturally
306 uses a C<use locale> statement.
310 When comparing two strings, where a numeric substring in one
311 place is I<not> up against a numeric substring in another,
312 the non-numeric always comes first. This is fudged by
313 reading pretending that the lack of a number substring has
314 the value -1, like so:
317 foobar => "foo", -1, "bar"
319 foo13xyz => "foo", 13, "xyz"
321 That's so that "foo" will come before "foo13", which will come
326 The start of a string is exceptional: leading non-\W (non-word,
328 components are are ignored, and numbers come I<before> letters.
332 I define "numeric substring" just as sequences matching m/\d+/ --
333 scientific notation, commas, decimals, etc., are not seen. If
334 your data has thousands separators in numbers
335 ("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"),
336 consider stripping them before feeding them to C<nsort> or
341 =head2 The nsort function
343 This function takes a list of strings, and returns a copy of the list,
346 This is what most people will want to use:
348 @stuff = nsort(...list...);
350 When nsort needs to compare non-numeric substrings, it
351 uses Perl's C<lc> function in scope of a <use locale>.
352 And when nsort needs to lowercase things, it uses Perl's
353 C<lc> function in scope of a <use locale>. If you want nsort
354 to use other functions instead, you can specify them in
355 an arrayref as the first argument to nsort:
358 \&string_comparator, # optional
359 \&lowercaser_function # optional
364 If you want to specify a string comparator but no lowercaser,
365 then the options list is C<[\&comparator, '']> or
366 C<[\&comparator]>. If you want to specify no string comparator
367 but a lowercaser, then the options list is
368 C<['', \&lowercaser]>.
370 Any comparator you specify is called as
371 C<$comparator-E<gt>($left, $right)>,
372 and, like a normal Perl C<cmp> replacement, must return
373 -1, 0, or 1 depending on whether the left argument is stringwise
374 less than, equal to, or greater than the right argument.
376 Any lowercaser function you specify is called as
377 C<$lowercased = $lowercaser-E<gt>($original)>. The routine
378 must not modify its C<$_[0]>.
380 =head2 The ncmp function
382 Often, when sorting non-string values like this:
384 @objects_sorted = sort { $a->tag cmp $b->tag } @objects;
386 ...or even in a Schwartzian transform, like this:
390 sort { $a->[1] cmp $b->[1] }
391 map { [$_, make_a_sort_key_from($_) ]
395 ...you wight want something that replaces not C<sort>, but C<cmp>.
396 That's what Sort::Naturally's C<ncmp> function is for. Call it with
397 the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>,
398 but otherwise it's a fine replacement:
400 @objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects;
404 sort { ncmp($a->[1], $b->[1]) }
405 map { [$_, make_a_sort_key_from($_) ]
409 Just as with C<nsort> can take different a string-comparator
410 and/or lowercaser, you can do the same with C<ncmp>, by passing
411 an arrayref as the first argument:
414 \&string_comparator, # optional
415 \&lowercaser_function # optional
420 You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>.
428 This module is not a substitute for
429 L<Sort::Versions|Sort::Versions>! If
430 you just need proper version sorting, use I<that!>
434 If you need something that works I<sort of> like this module's
435 functions, but not quite the same, consider scouting thru this
436 module's source code, and adapting what you see. Besides
437 the functions that actually compile in this module, after the POD,
438 there's several alternate attempts of mine at natural sorting
439 routines, which are not compiled as part of the module, but which you
440 might find useful. They should all be I<working> implementations of
441 slightly different algorithms
442 (all of them based on Martin Pool's C<nsort>) which I eventually
443 discarded in favor of my algorithm. If you are having to
444 naturally-sort I<very large> data sets, and sorting is getting
445 ridiculously slow, you might consider trying one of those
446 discarded functions -- I have a feeling they might be faster on
447 large data sets. Benchmark them on your data and see. (Unless
448 you I<need> the speed, don't bother. Hint: substitute C<sort>
449 for C<nsort> in your code, and unless your program speeds up
450 drastically, it's not the sorting that's slowing things down.
451 But if it I<is> C<nsort> that's slowing things down, consider
454 if(@set >= SOME_VERY_BIG_NUMBER) {
455 no locale; # vroom vroom
456 @sorted = sort(@set); # feh, good enough
457 } elsif(@set >= SOME_BIG_NUMBER) {
459 @sorted = sort(@set); # feh, good enough
461 # but keep it pretty for normal cases
462 @sorted = nsort(@set);
467 If you do adapt the routines in this module, email me; I'd
468 just be interested in hearing about it.
472 Thanks to the EFNet #perl people for encouraging this module,
473 especially magister and a-mused.
477 =head1 COPYRIGHT AND DISCLAIMER
479 Copyright 2001, Sean M. Burke C<sburke@cpan.org>, all rights
480 reserved. This program is free software; you can redistribute it
481 and/or modify it under the same terms as Perl itself.
483 This program is distributed in the hope that it will be useful, but
484 without any warranty; without even the implied warranty of
485 merchantability or fitness for a particular purpose.
489 Sean M. Burke C<sburke@cpan.org>
493 ############ END OF DOCS ############
495 ############################################################################
496 ############################################################################
498 ############ BEGIN OLD STUFF ############
500 # We can't have "use integer;", or else (5 <=> 5.1) comes out "0" !
502 #-----------------------------------------------------------------------------
505 return @_ if @_ < 2; # Just to be CLEVER.
507 my($x, $i); # scratch vars
509 # And now, the GREAT BIG Schwartzian transform:
515 # Uses $i as the index variable, $x as the result.
518 DEBUG and print "\nComparing ", map("{$_}", @$a),
519 ' : ', map("{$_}", @$b), , "...\n";
521 while($i < @$a and $i < @$b) {
522 DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
523 $a->[$i] cmp $b->[$i], "\n";
524 last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
527 DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
528 $a->[$i] <=> $b->[$i], "\n";
529 last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
533 DEBUG and print "{$a->[0]} : {$b->[0]} is ",
534 $x || (@$a <=> @$b) || 0
537 $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
538 # unless we found a result for $x in the while loop,
539 # use length as a tiebreaker, otherwise use cmp
540 # on the original string as a fallback tiebreaker.
544 my @bit = ($x = defined($_) ? $_ : '');
546 if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
547 # It's entirely purely numeric, so treat it specially:
550 # Consume the string.
552 push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
553 push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
556 DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
558 # End result: [original bit , (text, number), (text, number), ...]
559 # Minimally: [0-length original bit,]
561 # ['10' => '' , 10, ]
562 # ['fo900' => 'fo' , 900, ]
563 # ['foo10' => 'foo', 10, ]
564 # ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
565 # ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
566 # ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
567 # Yes, always an ODD number of elements.
574 #-----------------------------------------------------------------------------
575 # Same as before, except without the pure-number trap.
578 return @_ if @_ < 2; # Just to be CLEVER.
580 my($x, $i); # scratch vars
582 # And now, the GREAT BIG Schwartzian transform:
588 # Uses $i as the index variable, $x as the result.
591 DEBUG and print "\nComparing ", map("{$_}", @$a),
592 ' : ', map("{$_}", @$b), , "...\n";
594 while($i < @$a and $i < @$b) {
595 DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
596 $a->[$i] cmp $b->[$i], "\n";
597 last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
600 DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
601 $a->[$i] <=> $b->[$i], "\n";
602 last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
606 DEBUG and print "{$a->[0]} : {$b->[0]} is ",
607 $x || (@$a <=> @$b) || 0
610 $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
611 # unless we found a result for $x in the while loop,
612 # use length as a tiebreaker, otherwise use cmp
613 # on the original string as a fallback tiebreaker.
617 my @bit = ($x = defined($_) ? $_ : '');
620 push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
621 push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0;
623 DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
625 # End result: [original bit , (text, number), (text, number), ...]
626 # Minimally: [0-length original bit,]
628 # ['10' => '' , 10, ]
629 # ['fo900' => 'fo' , 900, ]
630 # ['foo10' => 'foo', 10, ]
631 # ['foo9.pl' => 'foo', 9, , '.pl', 0 ]
632 # ['foo32.pl' => 'foo', 32, , '.pl', 0 ]
633 # ['foo325.pl' => 'foo', 325, , '.pl', 0 ]
634 # Yes, always an ODD number of elements.
641 #-----------------------------------------------------------------------------
642 # Same as before, except for the sort-key-making
645 return @_ if @_ < 2; # Just to be CLEVER.
647 my($x, $i); # scratch vars
649 # And now, the GREAT BIG Schwartzian transform:
655 # Uses $i as the index variable, $x as the result.
658 DEBUG and print "\nComparing ", map("{$_}", @$a),
659 ' : ', map("{$_}", @$b), , "...\n";
661 while($i < @$a and $i < @$b) {
662 DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
663 $a->[$i] cmp $b->[$i], "\n";
664 last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
667 DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
668 $a->[$i] <=> $b->[$i], "\n";
669 last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
673 DEBUG and print "{$a->[0]} : {$b->[0]} is ",
674 $x || (@$a <=> @$b) || 0
677 $x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]);
678 # unless we found a result for $x in the while loop,
679 # use length as a tiebreaker, otherwise use cmp
680 # on the original string as a fallback tiebreaker.
684 my @bit = ($x = defined($_) ? $_ : '');
686 if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) {
687 # It's entirely purely numeric, so treat it specially:
690 # Consume the string.
692 push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
694 if($x =~ s/^(\d+)//s) {
695 if(substr($1,0,1) eq '0' and $1 != 0) {
696 push @bit, $1 / (10 ** length($1));
705 DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";
712 #-----------------------------------------------------------------------------
713 # Like nsort0, but WITHOUT pure number handling, and WITH special treatment
714 # of pulling off extensions and version numbers.
717 return @_ if @_ < 2; # Just to be CLEVER.
719 my($x, $i); # scratch vars
721 # And now, the GREAT BIG Schwartzian transform:
727 # Uses $i as the index variable, $x as the result.
730 DEBUG and print "\nComparing ", map("{$_}", @$a),
731 ' : ', map("{$_}", @$b), , "...\n";
733 while($i < @$a and $i < @$b) {
734 DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ",
735 $a->[$i] cmp $b->[$i], "\n";
736 last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic
739 DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ",
740 $a->[$i] <=> $b->[$i], "\n";
741 last if ($x = ($a->[$i] <=> $b->[$i])); # numeric
745 DEBUG and print "{$a->[0]} : {$b->[0]} is ",
746 $x || (@$a <=> @$b) || 0
749 $x || (@$a <=> @$b ) || ($a->[1] cmp $b->[1])
750 || ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]);
751 # unless we found a result for $x in the while loop,
752 # use length as a tiebreaker, otherwise use the
753 # lc'd extension, otherwise the verison, otherwise use
754 # the original string as a fallback tiebreaker.
758 my @bit = ( ($x = defined($_) ? $_ : ''), '',0 );
761 # Consume the string.
763 # First, pull off any VAX-style version
764 $bit[2] = $1 if $x =~ s/;(\d+)$//;
766 # Then pull off any apparent extension
767 if( $x !~ m/^\.+$/s and # don't mangle ".", "..", or "..."
768 $x =~ s/(\.[^\.\;]*)$//sg
769 # We could try to avoid catching all-digit extensions,
770 # but I think that's getting /too/ clever.
773 if($x =~ m<[^\\\://]$>s) {
774 # We didn't take the whole basename.
776 DEBUG and print "Consuming extension \"$1\"\n";
778 # We DID take the whole basename. Fix it.
779 $x = $1; # Repair it.
783 push @bit, '', -1 if $x =~ m/^\./s;
784 # A hack to make .-initial filenames sort first, regardless of locale.
785 # And -1 is always a sort-firster, since in the code below, there's
786 # no allowance for filenames containing negative numbers: -1.dat
787 # will be read as string '-' followed by number 1.
790 push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : '';
792 if($x =~ s/^(\d+)//s) {
793 if(substr($1,0,1) eq '0' and $1 != 0) {
794 push @bit, $1 / (10 ** length($1));
804 DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n";