my $PERIOD = '.'; my $SEMICOLON = ';'; my $SPACE = ' '; use constant PADDING => 22; sub cnparse { my ($callnum) = uc(shift); chomp( $callnum ); my ($initidx, $callnumlength, $isalpha, $isnum, $isalphanum); my ($isseparator, $callnumpartlength, $currchar, $nextchar); my ($stringcheck, $idx, $callnumpart); my @cnpart; my $curralpha = 0; my $currnum = 0; my $charidx = 0; my $callnumpartidx = 0; my $callnumpartcharidx = -1; my $hadperiod = 0; $cnpart[$_] = '' for (0..36); # remove any leading (meaningless) separators $callnum =~ s/^[ ;:,.\/]+//; $callnumlength = length($callnum) - 1; while ($charidx <= $callnumlength) { $currchar = substr($callnum, $charidx, 1); if ($callnumpartidx == 1 and alphacheck($cnpart[0]) and numcheck($cnpart[1]) and $currchar eq $SPACE and substr($callnum, $charidx + 1, 1) ne $PERIOD) { $callnumpartidx++; $callnumpartcharidx--; $cnpart[$callnumpartidx] = ".0"; } $isseparator = sepcheck($currchar); $hadperiod = 1 if $currchar eq $PERIOD; if (!$isseparator) { $isalpha = alphacheck($currchar); $isnum = numcheck($currchar); if (($curralpha && $isalpha) || ($currnum && $isnum) || (!$isalpha && !$isnum)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (!$curralpha && !$currnum) { $curralpha = $isalpha; $currnum = $isnum; $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if ( ($curralpha && $isnum) || ($currnum && $isalpha) ) { $callnumpartidx++; $callnumpartcharidx--; if ($curralpha && $isnum) { $callnumpartcharidx = storechar($PERIOD, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $isalpha; $currnum = $isnum; } } # isseparator elsif ($charidx < $callnumlength) { $nextchar = substr($callnum, $charidx+1, 1); if (($currchar eq $PERIOD) && ($nextchar eq $SEMICOLON)) { $charidx++; $nextchar = substr($callnum, $charidx+1, 1); } if (sepcheck($nextchar)) { if (($currchar =~ /[;, ]/) and ($nextchar eq $PERIOD)) { $hadperiod = 1; $callnumpartidx++; $callnumpartcharidx--; $callnumpartcharidx = storechar($nextchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $currnum = 0; $charidx++; } if (($currchar eq $PERIOD) && ($nextchar ne $SPACE)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $callnumpartidx++; $callnumpartcharidx--; $charidx++; } } # if isseparator # have a separator and nextchar is not a separator else { if ($currchar eq $PERIOD) { if (!$curralpha) {$callnumpartidx++; $callnumpartcharidx--;} $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); if ($curralpha) {$callnumpartidx++; $callnumpartcharidx--;} } else {$callnumpartidx++; $callnumpartcharidx--;} $curralpha = $currnum = 0; } } # isseparator # charidx < callnumlength $charidx++; } # do while charidx <= callnumlength # pad parse groups as necessary for sorting @cnpart = pad_groups(\@cnpart); return @cnpart; } ### the special handling of the period character allows ### numeric chunks parsed out well after the initial ### accompanying period to be correctly treated as part ### of a decimal number. sub storechar { my ($achar, $callnumpartidx, $callnumpartcharidx, $hadperiod, $cnpart) = @_; $callnumpartcharidx++; if (length($cnpart->[$callnumpartidx]) == 0) { $callnumpartcharidx = 0; } if (($achar ne '.') or ($callnumpartcharidx == 0 and $hadperiod)) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } return $callnumpartcharidx; } ### assumed that current cnpart is numeric ### we don't do anything until we're at least on the 4th cnpart ### if previous cnpart is not empty and starts with a letter, ### look at the cnpart before that (current-2) and ### if that one is all numeric, ### prepend a period to the current cnpart ### if none of the above applies, ### pad the current all-numeric cnpart with leading zeros sub pad_groups { my @cnparts = @{(shift)}; for my $partnumber (0..$#cnparts) { my $part = $cnparts[$partnumber]; if ($part =~ /^\d+[^a-z\d]?$/i) { if ($partnumber > 2 and alphacheck(substr($cnparts[$partnumber-1], 1)) and numcheck($cnparts[$partnumber-2])) { $cnparts[$partnumber] = ".$part"; } else { $cnparts[$partnumber] = sprintf('%0'.PADDING.'d', $part); } } $cnparts[$partnumber] = sprintf('%-'.PADDING.'.'.PADDING.'s', $cnparts[$partnumber]); } return @cnparts; } ### returns: 1 if contains only digits ### 0 if contains at least one other character or if empty string sub numcheck { return ($_[0] =~ /^\d+$/) ? 1 : 0; } ### returns: 1 if contains only letters ### 0 if contains at least one other character or if empty string sub alphacheck { return ($_[0] =~ /^[a-z]+$/i) ? 1 : 0; } ### returns: 1 if contains only letters and digits ### 0 if contains at least one other character or if empty string sub alphanumcheck { return ($_[0] =~ /^[a-z\d]+$/i) ? 1 : 0; } ### returns: 1 if is a section separator ### 0 if not ### separator chars: space, semicolon, colon, comma, period, forward slash sub sepcheck { return ($_[0] =~ /[ ;:,.\/]/) ? 1 : 0; }