#!/usr/local/bin/perl # fixpatpurgesif.pl ## README ############################################################### # The patron purge program creates a deleted patrons file that receives # a .SIF extension, but it really is not in patron SIF format. # # There are several problems with how this "SIF" file gets created: # 1. There are no EOL characters; the whole file is one looonnng line. # 2. The initial fixed segment is not fixed in length; there can be # more than 3 patgroup-barcode groupings. Pattern matching is used # to determine when these groupings have all been read. # 3. The address count field is disconnected from reality. It is # supposed to be a single digit field (0-9 addresses possible). # Instead it cycles non-contiguously from 1 to 32766, then from # -32768 back down to 1, and then repeats this cycle for larger # numbers of "records" encountered in the input file. Thus, instead # of the required single digit, we get up to 5 additional characters # [-, and 0-9] which shoves the rest of the record to the right. # 4. In the address segment the begin and end date fields are missing # sometimes. Arbitrary begin and end date constant values are added # if necessary. # 5. If there is no address, we (WMU) typically show "address needed". # However, the address part of such records may not meet addr seg # specs as output by the purge process. # # These problems are mostly corrected or rendered harmless and a usable # patron (near) SIF file is created. # # The interactive output shows records created. The number in () indicates # extra records created if a patron has more then 3 barcode+group chunks. # # # CAUTIONARY NOTE: # Not every record is correctly detected. The primary (only?) manifestation # of this is concatenated records. # You might be able to fix this by manually editing these records, if # you're familiar with the patron SIF format. # Possibly a better way to fix this is to change the regular expressions # that are used to detect the beginning of a record. # Search for BEGRECCHECK and make changes in that area as needed (if you're # willing to tackle Perl and regular expressions). # # # Written by Roy Zimmer, Western Michigan University Libraries # ## README ############################################################### #----- USER SETUP -------------------------------------------------------- # The address begin and end dates are often missing. Supply desired values # for these fields inside the double quotes here. my $addrbegdate = "2001-01-01"; my $addrenddate = "2003-02-28"; #----- USER SETUP -------------------------------------------------------- ###### DO NOT MAKE CHANGES BELOW ### UNLESS YOU ARE COMFORTABLE WITH PERL AND/OR ### PERL REGULAR EXPRESSIONS. SINCE THIS PROGRAM ### CREATES A MODIFIED COPY OF EXISTING DATA, YOU ### WON'T DESTROY ANYTHING, BUT BE CAREFUL IF YOU ### ARE NOT SURE OF WHAT YOU ARE DOING. if ($#ARGV == -1) {usage();} use strict; my ($infile, $outfile, $fopen, $recctr, $recctr2, $addrchars, $fixseg); my ($fixsegpiece, $lineout, $done, $idx, $ch, $addrpiece, $addrsegsize); my ($char45, $char18, $recstart, $adjustment, $pgstuff, $try); my ($ctr, $pos, $machine); my @pgextra; $addrsegsize = 409; # should be 429 if true patron SIF format $infile = $ARGV[0]; $outfile = $ARGV[1]; $fopen = sprintf("Cannot open file %s for output\n", $outfile); open(outfile, ">$outfile") or die $fopen; $fopen = sprintf("Cannot open file %s for input\n", $infile); open(infile, $infile) or die $fopen; $recctr = 1; # this indicates the record currently being output $recctr2 = 0; # count extra records for when there are > 3 barcode+group chunks $addrchars = 0; $fixsegpiece = ''; # get first 45 chars for starters, and for loop processing consistency for (1..45) { $ch = getc(infile); $fixsegpiece .= $ch; } while ($ch ne undef) # a "loose" loop test { print "creating record $recctr\r"; # do the fixed segment (adding to the 45 chars already there) $lineout = $fixsegpiece; $fixsegpiece = ''; # basic 3 patgroup and barcode groupings x 56 + 1st 10 chars (patid) = 178 # 178 - 45 already read = 133 for (1..133) {$lineout .= getc(infile);} # in case there are more then 3 groupings, get the rest $pgstuff = $try = ''; for (1..10) {$try .= getc(infile);} # if not a date, and is not all blank, must be barcode/group data while (($try !~ /\d{4}-\d\d-\d\d/) and ($try ne ' 'x10)) { $pgstuff .= $try; for (1..46) {$pgstuff .= getc(infile);} # finish getting the current grouping $try = ''; for (1..10) {$try .= getc(infile);} } # done getting extra barcode/group stuff; process and save for now @pgextra = (); $idx = $pos = 0; $ctr = 1; while ($pos < length($pgstuff)) { $pgextra[$idx] .= substr($pgstuff, $pos, 56); $pos += 56; $ctr++; if ($ctr > 3) # set up for next grouping of up to 3 { $ctr = 1; $idx++; } } $fixsegpiece .= $try; # in the next line, we get the rest of the fixed segment except for # that last field, the address count in "column 456". for (1..267) {$fixsegpiece .= getc(infile);} $lineout .= $fixsegpiece; # the last character of the fixed segment in the file at this point should # contain the number of address segments (address count). this value is not # accurate in this file (see opening comments above), so I'm arbitrarily # setting this value to *** 1 ***. This is later overwritten if necessary. $lineout .= '1'; # and here we adjust for the mostly oversize address count field by doing a # pattern match for the beginning of the address segment, then dropping any # extra leading characters that are part of the too large address count # number from the fixed segment that "shoved" the address segment over $char18 = ''; for (1..18) {$char18 .= getc(infile);} $done = 0; $idx = 1; while (($idx <= 6) and (!$done)) { # we know that the address segment starts with: # 10 digits, # followed by 1 or 2 or 3, # followed by n or h if (substr($char18, 0, $idx+12) =~ /\d{10}[123][nh]$/i) { $char18 = substr($char18, $idx); $done = 1; } $idx++; } # do the address segment(s) $addrpiece = $char18; # from previous required read-ahead $addrchars = $addrsegsize - length($char18); $ch = getc(infile); while ($addrchars > 0) { $addrpiece .= $ch; $addrchars--; $ch = getc(infile); } $lineout .= $addrpiece; # try for the note, if there is one # at this point, we already have the next character, # read in the previous section $char45 = $ch; $try = 44; while (($try > 0) and ($ch ne undef)) { $ch = getc(infile); $char45 .= $ch; $try--; } # first 20 chars of a patron record should be all digits; # two fields, 10 digits each. in our case (and in most cases), # at least the first 3 digits in each field should be 0. the barcode # (the next 25 characters) should be, for WMU, 14 digits and 11 spaces # the 2nd condition possibility is if the barcode is 11 digits and # 14 spaces, where the barcode happens to not be trailing zero filled. $recstart = 0; while ((!$recstart) and ($ch ne undef)) { # BEGRECCHECK if (($char45 =~ /^000\d{7}000\d{7}\d{14} {11}/) or ($char45 =~ /^000\d{7}000\d{7}\d{11} {14}/)) { # we seem to be at EOL. there is no note, or we just processed it. $recctr++; # output the record addressfixer(); print outfile "$lineout\n"; # if there were more than 3 patron group and barcode groupings, # output as many additional patron records for this patron as are # needed to accomodate the extra groupings, 3 at a time $idx = 0; while ($idx < scalar(@pgextra)) { # if less then 3 groups this time, pad it out appropriately while (length($pgextra[$idx]) < 168) {$pgextra[$idx] .= ' 'x56;} # insert this grouping chunk into the patron record substr($lineout, 10, 168) = $pgextra[$idx]; print outfile "$lineout\n"; $recctr2++; $idx++; } $fixsegpiece = $char45; # start of a new record (usually) $recstart = 1; } else # seem to have a note { $lineout .= substr($char45, 0, 1); # leftmost char must be part of a note $ch = getc(infile); if ($ch ne undef) {$char45 = substr($char45, 1) . $ch;} else # at EOF { $char45 = substr($char45, 1); addressfixer(); print outfile "$lineout$char45\n"; } } } last if ($ch eq undef); } close(outfile); print "output $recctr ($recctr2) records\n"; sub addressfixer # add begin and end dates as necessary # adjust for 2nd address segment if present { # check the first address segment, add begin and end dates if missing if (substr($lineout, 468, 20) !~ /\d{4}-\d\d-\d{6}-\d\d-\d\d/) { $lineout = substr($lineout, 0, 468) . $addrbegdate . $addrenddate . substr($lineout, 468); } # loose check to see if we have two addresses; adjust the count. # add the begin and end dates if necessary if (length($lineout) > 1200) { substr($lineout, 455, 1) = '2'; # are the begin and end dates already there? if (substr($lineout, 897, 20) !~ /\d{4}-\d\d-\d{6}-\d\d-\d\d/) { $lineout = substr($lineout, 0, 897) . $addrbegdate . $addrenddate . substr($lineout, 897); } } } sub usage() { printf ("\nUsage: fixpatpurgesif.pl infile outfile\n"); printf (" Voyager's patron purge creates a faulty patron sif delete file.\n"); printf (" This program restores the missing EOL character for each record.\n"); printf (" Processes infile resulting in outfile.\n"); exit(0); }