#!/usr/local/bin/perl $testfile = "c:\\wmuzimmer\\democall.lst"; $fopen = sprintf("Cannot open file %s for input\n", $testfile); open(testfile, $testfile) or die $fopen; @testline = ; close(testfile); chomp @testline; $idx = 0; while ($idx < @testline) { @cnparts = cnparse($testline[$idx]); for ($j=0; $j<37; $j++) {$raw[$idx] .= sprintf("%-22.22s`", $cnparts[$j]);} $raw[$idx] .= $testline[$idx] . "\n"; printf("call#>>%s\n", $testline[$idx]); $tidx = 0; while ($tidx < @cnparts) { if (length($cnparts[$tidx]) > 0) {printf("%d>>%s\n", $tidx, $cnparts[$tidx]);} $tidx++; } $idx++; } @sorted = sort{$a cmp $b} @raw; printf ("\nUNSORTED\n"); $idx = 0; while ($idx < @testline) { printf ("%s\n", $testline[$idx]); $idx++; } chomp @sorted; printf ("\nSORTED\n"); $idx = 0; while ($idx < @sorted) { @temp = split /`/, $sorted[$idx]; printf ("%s\n", $temp[37]); $idx++; } sub cnparse ### input: callnum - contains callnumber ### output: cnpart[0..36] - call number parse elements suitable for smart sorting ### usage: @callnumber_parts_array = cnparse($callnumber) ### you can then incorporate the callnumber parts at the beginning of ### the "record" for sorting purposes. sort results will not be perfect, ### but will be very close (in my experience), and better than any other ### callnumber sortings that I have seen. ### ### various character check routines are called as subroutines, ### rather than inlined, for clarity of code ### the following are considered to be separators within the callnumber: ### space " ", semicolon ";", colon ":", comma ",", period ".", forward slash "/" ### all resulting call number parts, no matter how "full", are of equal length. ### ### written by Roy Zimmer, Western Michigan University (for Waldo Library) ### versions ### 1.0 1990s initial version, ported from SAS on mainframe when used with NOTIS ### 1.1 2002 removed old garbage and improved code { my ($callnum) = @_; my $curralpha = 0; my $currnum = 0; my $charidx = 0; my $period = "."; my $space = " "; my $semicolon = ";"; my ($initidx, $callnumlength, $isalpha, $isnum, $isalphanum); my ($isseparator, $callnumpartlength, $currchar, $nextchar); my $parselength = 22; my $callnumpartidx = 0; my $callnumpartcharidx = -1; my $hadperiod = 0; my ($stringcheck, $idx, $callnumpart); my @cnpart; for $initidx (0..36) {$cnpart[$initidx] = "";} # remove any leading (meaningless) separators while (sepcheck(substr($callnum, 0, 1))) {$callnum = substr($callnum, 1);} $callnumlength = length($callnum) - 1; while ($charidx <= $callnumlength) { $currchar = uc(substr($callnum, $charidx, 1)); if ($callnumpartidx == 1) { if (alphacheck($cnpart[0]) && numcheck($cnpart[1]) && $currchar eq $space && substr($callnum, $charidx+1, 1) ne $period) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $cnpart[$callnumpartidx] = ".0"; } } $isseparator = sepcheck($currchar); if ($currchar eq $period) {$hadperiod = 1;} if (!$isseparator) { $isalpha = alphacheck($currchar); $isnum = numcheck($currchar); if (($curralpha && $isalpha) || ($currnum && $isnum) || (!$isalpha && !$isnum)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (!$curralpha && !$currnum) { $curralpha = $isalpha; $currnum = $isnum; $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (($curralpha && $isnum) || ($currnum && $isalpha)) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); if ($curralpha && $isnum) { $callnumpartcharidx = storechar($period, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $isalpha; $currnum = $isnum; } } else # isseparator { if ($charidx < $callnumlength) { $nextchar = substr($callnum, $charidx+1, 1); if (($currchar eq $period) && ($nextchar eq $semicolon)) { $charidx++; $nextchar = substr($callnum, $charidx+1, 1); } $isseparator = sepcheck($nextchar); if ($isseparator) { # nextchar = . and currchar = semicolon, comma, or space if (($currchar =~ /[;, ]/) and ($nextchar eq $period)) { $hadperiod = 1; ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $callnumpartcharidx = storechar($nextchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $currnum = 0; $charidx++; } if (($currchar eq $period) && ($nextchar ne $space)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $charidx++; } } # if isseparator else # have a separator and nextchar is not a separator { if ($currchar eq $period) { if (!$curralpha) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); if ($curralpha) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } } else { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } $curralpha = $currnum = 0; } } # charidx < callnumlength } # isseparator $charidx++; } # do while charidx <= callnumlength # pad parse groups as necessary for sorting for ($idx=0; $idx<=$callnumpartidx; $idx++) { $callnumpart = $cnpart[$idx]; $callnumpartlength = length($callnumpart) - 1; $isnum = numcheck($callnumpart); # all digits if ($isnum) { # right/left justify numeric cnpart, as required $callnumpart = adjcpnum($idx, $callnumpart, $parselength, $callnumpartlength, \@cnpart); } else { $stringcheck = substr($callnumpart, 0, 1); $isalphanum = alphanumcheck($stringcheck); if ($isalphanum && ($callnumpartlength > 0)) { $stringcheck = substr($callnumpart, 0, $callnumpartlength-1); $isnum = numcheck($stringcheck); $stringcheck = substr($callnumpart, $callnumpartlength, 1); $isalphanum = alphanumcheck($stringcheck); # leading digits, trailing single non-alphanumeric if (!$isalphanum && $isnum) { $callnumpart = adjcpnum($idx, $callnumpart, $parselength, $callnumpartlength, \@cnpart); } } } $cnpart[$idx] = $callnumpart; } return @cnpart; } sub storechar ### the special handling of the period character allows ### numeric chunks parsed out well after the initial ### accompanying period to be correctly treated as part ### of a decimal number. { my ($achar, $callnumpartidx, $callnumpartcharidx, $hadperiod, $cnpart) = @_; my $period = "."; $callnumpartcharidx++; if (length($cnpart->[$callnumpartidx]) == 0) {$callnumpartcharidx = 0;} if ($achar ne $period) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } else { if (($callnumpartcharidx == 0) && $hadperiod) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } } return $callnumpartcharidx; } sub adjcpnum ### code macro ### assumed that current cnpart is numeric ### we don't do anything until we're at least on the 4th cnpart ### if previous cnpart is not empty and starts with a letter, ### look at the cnpart before that (current-2) and ### if that one is all numeric, ### prepend a period to the current cnpart ### if none of the above applies, ### pad the current all-numeric cnpart with leading zeros { my ($idx, $callnumpart, $parselength, $callnumpartlength, $cnpart) = @_; my $doneit = 0; my ($isalphaprevchunkbeg, $stringcheck); my $period = "."; if ($idx > 2) { $stringcheck = $cnpart->[$idx-1]; # in case preceded by period $isalphaprevchunkbeg = 0; if (length($stringcheck) > 0) { $stringcheck = substr($stringcheck, 1); $isalphaprevchunkbeg = alphacheck($stringcheck); } if ($isalphaprevchunkbeg) { $stringcheck = $cnpart->[$idx-2]; if (numcheck($stringcheck)) { $callnumpart = $period . $callnumpart; $doneit = 1; } } } if (!$doneit) { $callnumpart = (sprintf ("0" x ($parselength-$callnumpartlength-1))) . $callnumpart; } } sub upcallnumpartidx ### code macro { my ($callnumpartidx, $callnumpartcharidx) = @_; $callnumpartidx++; $callnumpartcharidx = -1; return ($callnumpartidx, $callnumpartcharidx); } sub numcheck ### check input string ### returns: 1 if contains only digits ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[0-9]+$/) {return 1;} else {return 0;} } sub alphacheck ### check input string ### returns: 1 if contains only letters ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[A-Za-z]+$/) {return 1;} else {return 0;} } sub alphanumcheck ### check input string ### returns: 1 if contains only letters and digits ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[a-zA-Z0-9]+$/) {return 1;} else {return 0;} } sub sepcheck ### check call number character ### returns: 1 if is a section separator ### 0 if not { my ($cnchar) = @_; # separator chars: space, semicolon, colon, comma, period, forward slash if ($cnchar =~ /[ ;:,.\/]/) {return 1;} else {return 0;} }