#!/usr/local/bin/perl # remapcols.pl # desc: does character remapping by specified columns # wordsettings: # end_autoz # input: text file, field specifications # process: create backup copy of input file with .raw added to the name. # reads input file, converting characters from MARC-8 to Latin-1 # character set where applicable, for each field specified in the # command line, resulting in a temporary file. # this temporary file is then moved "on top of" the input file. # Example: charremappc32.pl myfile.rpt bc1:ec1/bc2:ec2 # myfile.rpt.raw is created for backup purposes. # tempfile containing converted text is created. # rename tempfile myfile.rpt # this results in myfile.rpt (converted) and # myfile.rpt.raw (original) # note: the field length remains unchanged. # padding with spaces or shortening of the field is done, as appropriate # # please see legal notice at end of program ### setup if ($#ARGV < 0) {usage();} $infile = $ARGV[0]; $fieldspec = $ARGV[1]; @fieldbeg = (); @fieldend = (); parsefields(); # create backup copy $command = sprintf("cp %s %s.raw\n", $infile, $infile); system($command); $fopen = sprintf("Cannot open file %s for input\n", $infile); open(infile, $infile) or die $fopen; @lines = ; close(infile); $outfile = ">xxYY123temp"; $fopen = sprintf("Cannot open tempfile for output\n"); open(outfile, $outfile) or die $fopen; # convert file text, line by line, field by specified field $idx = 0; while ($idx < @lines) { $lines[$idx] =~ s/\t/ /g; # replace tabs with space $line = $lines[$idx]; chomp $line; # for correct handling if column goes to end of line $linelen = length($line); $fidx = 0; while ($fidx < @fieldbeg) # remap specified fields { if ($linelen >= $fieldend[$fidx]-1) { $field = substr($line, $fieldbeg[$fidx]-1, $fieldend[$fidx]-$fieldbeg[$fidx]+1); $field = &CharacterMapping($field); $line = substr($line, 0, $fieldbeg[$fidx]-1) . $field . substr($line, $fieldend[$fidx]); } $fidx++; } printf outfile ("%s\n", $line); # restore LF here $idx++; } close(outfile); # replace input file with tempfile, resulting in converted file $command = sprintf("rm %s\n", $infile); system($command); $command = sprintf("mv %s %s\n", substr($outfile, 1), $infile); system($command); sub parsefields() { # get specs for each field @fieldpairs = split /\//, $fieldspec; $errormsg = ''; $idx = 0; while ($idx < @fieldpairs) { ($fieldbeg[$idx], $fieldend[$idx]) = split /:/, $fieldpairs[$idx]; if ($fieldbeg[$idx] == $fieldend[$idx]) { $errormsg = sprintf("Cannot have zero-length field: %d:%d", $fieldbeg[$idx], $fieldend[$idx]); } if ($fieldbeg[$idx] > $fieldend[$idx]) { $errormsg = sprintf("Beginning column must be smaller than ending column: %d:%d", $fieldbeg[$idx], $fieldend[$idx]); } if (($fieldbeg[$idx] <= 0) or ($fieldend[$idx] <= 0)) { $errormsg = sprintf("Cannot have a 0 (zero) or negative numbers for field beginning/ending: %d:%d", $fieldbeg[$idx], $fieldend[$idx]); } $idx++; } if ($idx < 1) {$errormsg = "Must have at least one complete field specification";} if ($errormsg) { printf("%s\n", $errormsg); exit(0); } } sub CharacterMapping # each section below that doesn't do a 1 to 1 remapping # either pads the text with spaces, or cuts characters # off the end to preserve the size of the text. { my ($line) = @_; # Convert input string to hexadecimal $line =~ s/(.)/sprintf ("%%%X", ord($1))/eg; # restore formfeed character as passthrough $line =~ s/^%C/sprintf("%c", 12)/e; my @orphan_chars_combined_5 = ('%1B%67%61%1B%73', '%1B%67%62%1B%73', '%1B%67%63%1B%73', '%1B%62%30%1B%73', '%1B%62%31%1B%73', '%1B%62%32%1B%73', '%1B%62%33%1B%73', '%1B%62%34%1B%73', '%1B%62%35%1B%73', '%1B%62%36%1B%73', '%1B%62%37%1B%73', '%1B%62%38%1B%73', '%1B%62%39%1B%73', '%1B%62%28%1B%73', '%1B%62%2B%1B%73', '%1B%62%29%1B%73', '%1B%70%30%1B%73', '%1B%70%31%1B%73', '%1B%70%32%1B%73', '%1B%70%33%1B%73', '%1B%70%34%1B%73', '%1B%70%35%1B%73', '%1B%70%36%1B%73', '%1B%70%37%1B%73', '%1B%70%38%1B%73', '%1B%70%39%1B%73', '%1B%70%28%1B%73', '%1B%70%2D%1B%73', '%1B%70%2B%1B%73', '%1B%70%29%1B%73' ); my @orphan_chars_combined_2= ('%E0%E6', '%E1%E3', '%E1%E5', '%E1%E6', '%E1%E8', '%E2%A5', '%E2%B5', '%E2%E4', '%E2%E5', '%E2%E6', '%E2%E8', '%E2%EA', '%E2%F0', '%E3%E0', '%E3%E1', '%E3%E2', '%E3%F2', '%E4%E3', '%E4%E6', '%E5%E4', '%E5%E7', '%E5%E8', '%E5%A5', '%E5%B5', '%E5%F1', '%E5%F2', '%E6%F0', '%E6%F2', '%E7%E2', '%E7%F2', '%E8%E4', '%E8%E5', '%E9%E8' ); my @orphan_chars_single = ('%A1','%A3','%A6','%A7','%A9','%AC','%AD','%AE', '%AF','%B0','%B1','%B6','%B7','%B8','%BB', '%BE','%F7' ); my %marc8_to_latin1_combined = ('%E2%41' => 'C1', '%E3%41' => 'C2', '%E4%41' => 'C3', '%E8%41' => 'C4', '%EA%41' => 'C5', '%E2%43' => '43', '%E3%43' => '43', '%F0%43' => 'C7', '%E1%45' => 'C8', '%E2%45' => 'C9', '%E3%45' => 'CA', '%E4%45' => '45', '%E8%45' => 'CB', '%F0%45' => '45', '%E2%47' => '47', '%E3%47' => '47', '%F0%47' => '47', '%E3%48' => '48', '%E8%48' => '48', '%F0%48' => '48', '%E1%49' => 'CC', '%E2%49' => 'CD', '%E3%49' => 'CE', '%E4%49' => '49', '%E8%49' => 'CF', '%E3%4A' => '4A', '%E2%4B' => '4B', '%E3%4B' => '4B', '%F0%4B' => '4B', '%F2%4B' => '4B', '%E2%4C' => '4C', '%E3%4C' => '4C', '%F0%4C' => '4C', '%E2%4D' => '4D', '%E1%4E' => '4E', '%E2%4E' => '4E', '%E4%4E' => 'D1', '%F0%4E' => '4E', '%E1%4F' => 'D2', '%E2%4F' => 'D3', '%E3%4F' => 'D4', '%E4%4F' => 'D5', '%E8%4F' => 'D6', '%E2%50' => '50', '%E2%52' => '52', '%E2%53' => '53', '%E3%53' => '53', '%F0%53' => '53', '%F0%54' => '54', '%E1%55' => 'D9', '%E2%55' => 'DA', '%E3%55' => 'DB', '%E4%55' => '55', '%E8%55' => 'DC', '%EA%55' => '55', '%E4%56' => '56', '%E1%57' => '57', '%E2%57' => '57', '%E3%57' => '57', '%E8%57' => '57', '%E8%58' => '58', '%E1%59' => '59', '%E2%59' => 'DD', '%E3%59' => '59', '%E4%59' => '59', '%E8%59' => '59', '%E2%5A' => '5A', '%E3%5A' => '5A', '%E1%61' => 'E0', '%E2%61' => 'E1', '%E3%61' => 'E2', '%E4%61' => 'E3', '%E8%61' => 'E4', '%EA%61' => 'E5', '%E2%63' => '63', '%E3%63' => '63', '%F0%63' => 'E7', '%E1%65' => 'E8', '%E2%65' => 'E9', '%E3%65' => 'EA', '%E4%65' => '65', '%E8%65' => 'EB', '%F0%65' => '65', '%E2%67' => '67', '%E3%67' => '67', '%F0%67' => '67', '%E3%68' => '68', '%E8%68' => '68', '%F0%68' => '68', '%E1%69' => 'EC', '%E2%69' => 'ED', '%E3%69' => 'EE', '%E4%69' => '69', '%E8%69' => 'EF', '%E3%6A' => '6A', '%E2%6B' => '6B', '%E3%6B' => '6B', '%F0%6B' => '6B', '%F2%6B' => '6B', '%E2%6C' => '6C', '%E3%6C' => '6C', '%F0%6C' => '6C', '%E2%6D' => '6D', '%E1%6E' => '6E', '%E2%6E' => '6E', '%E4%6E' => 'F1', '%F0%6E' => '6E', '%E1%6F' => 'F2', '%E2%6F' => 'F3', '%E3%6F' => 'F4', '%E4%6F' => 'F5', '%E8%6F' => 'F6', '%E2%70' => '70', '%E2%72' => '72', '%E2%73' => '73', '%E3%73' => '73', '%F0%73' => '73', '%E8%74' => '74', '%F0%74' => '74', '%E1%75' => 'F9', '%E2%75' => 'FA', '%E3%75' => 'FB', '%E4%75' => '75', '%E8%75' => 'FC', '%EA%75' => '75', '%E4%76' => '76', '%E1%77' => '77', '%E2%77' => '77', '%E3%77' => '77', '%E8%77' => '77', '%EA%77' => '77', '%E8%78' => '78', '%E1%79' => '79', '%E2%79' => 'FD', '%E3%79' => '79', '%E4%79' => '79', '%E8%79' => '79', '%EA%79' => '79', '%E8%79' => 'FF', '%E2%7A' => '7A', '%E3%7A' => '7A', '%E2%A2' => '4F', '%E1%AC' => '4F', '%E2%AC' => '4F', '%E4%AC' => '4F', '%E1%AD' => '55', '%E2%AD' => '55', '%E4%AD' => '55', '%E2%B2' => '6F', '%E1%BC' => '6F', '%E2%BC' => '6F', '%E4%BC' => '6F', '%E1%BD' => '75', '%E2%BD' => '75', '%E4%BD' => '75', '%E9%73' => '73', '%E9%7A' => '7A', '%E9%53' => '53', '%E9%5A' => '5A', '%E1%41' => 'C0' ); # E1-41 is here for convenience; don't know if *should* be here my %marc8_to_latin1_single = ('%A2' => 'D8', '%A4' => 'DE', '%A5' => 'C6', '%A8' => 'B7', '%AA' => 'AE', '%AB' => 'B1', '%B2' => 'F8', '%B4' => 'FE', '%B5' => 'E6', '%B9' => 'A3', '%BA' => 'F0', '%C0' => 'B0', '%C3' => 'A9', '%C5' => 'BF', '%C6' => 'A1', '%B3' => '64' ); my %marc8_fin_to_latin1_single = ('%83' => 'C5', '%84' => 'C4', '%85' => 'D6', '%86' => 'E5', '%87' => 'E4', '%88' => 'F6' ); $count = 0; foreach my $char1 (@orphan_chars_combined_5) { while ($line =~ /$char1/g) {$count++;} $line =~ s/$char1//g; $count = $count * 5; $line .= ' ' x $count; $count = 0; } $count = 0; foreach my $char1 (@orphan_chars_combined_2) { while ($line =~ /$char1/g) {$count++;} $line =~ s/$char1//g; $count = $count * 2; $line .= ' ' x $count; $count = 0; } $count = 0; foreach my $marc_char1 (keys (%marc8_to_latin1_combined)) { while ($line =~ /$marc_char1/g) {$count++;} $line =~ s/$marc_char1/pack("C", hex($marc8_to_latin1_combined{$marc_char1}))/eg; $line .= ' ' x $count; $count = 0; } $count = 0; foreach my $char2 (@orphan_chars_single) { while ($line =~ /$char2/g) {$count++;} $line =~ s/$char2//g; $line .= ' ' x $count; $count = 0; } foreach my $marc_char2 (keys (%marc8_to_latin1_single)) { $line =~ s/$marc_char2/pack("C", hex($marc8_to_latin1_single{$marc_char2}))/eg; } $count = 0; while ($line =~ /%BD/g) {$count++;} if ($count > 0) { $line =~ s/%BD/%75%27/g; $line = substr($line, 0, length($line)-(3*$count)); $count = 0; } $count = 0; while ($line =~ /%BC/g) {$count++;} if ($count > 0) { $line =~ s/%BC/%6F%27/g; $line = substr($line, 0, length($line)-(3*$count)); } # restore original character format $line =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/eg; return($line); } sub usage { printf("\nUsage: remapcols.pl filespec fieldspec\n"); printf(" filespec is the file to be remapped (include path if necessary).\n"); printf(" filespec.raw is created as a safe copy of the file.\n"); printf(" fieldspec contains 1 or more begcol+endcol pairs for each field.\n"); printf(" Use the slash '/' to separate column pairs.\n"); printf(" Example:\n"); printf(" remapcols.pl abc.rpt 32:36/48:65\n"); printf(" This means that in file abc.rpt, a field in columns 32-36 and another\n"); printf(" field in columns 48-65 will be remapped.\n"); printf(" Note that the column range is inclusive.\n"); exit(0); } ######################################################################## ### I have taken a large portion of the charactermapping subroutine ### ### from Michael Doran's newbooks.pl code. ### ### Many modification have been made so that the remapping is ### ### suitable for our uses. ### ### Thus the following legalese is included. ### ### -- Roy Zimmer, on behalf of University Libraries, ### ### Western Michigan University---Fall 2002 ### ######################################################################## ######################################################################## # # newbooks.pl : a New Books List program # # Version: 5.0 # # Created by Michael Doran, doran@uta.edu # # University of Texas at Arlington Libraries # Box 19497, Arlington, TX 76019, USA # # This Perl program has two distinct parts: # 1) The first connects to the Voyager database and # extracts data on "new" items via an SQL query. # The data undergoes a character set conversion # as it is being output to a flat-file database. # 2) The second part of the script transfers the # flat-file database to where it can be accessed # by the New Books CGI program (newbooks.cgi). # # More information at: http://rocky.uta.edu/doran/autolist/ # ######################################################################## # # Copyright 2000-2002, The University of Texas at Arlington ("UTA"). # All rights reserved. # # By using this software the USER indicates that he or she # has read, understood and and will comply with the following: # # UTA hereby grants USER permission to use, copy, modify, and # distribute this software and its documentation for any # purpose and without fee, provided that: # # 1. the above copyright notice appears in all copies of the # software and its documentation, or portions thereof, and # # 2. a full copy of this notice is included with the software # and its documentation, or portions thereof, and # # 3. neither the software nor its documentation, nor portions # thereof, is sold for profit. Any commercial sale or license # of this software, copies of the software, its associated # documentation and/or modifications of either is strictly # prohibited without the prior consent of UTA. # # Title to copyright to this software and its associated # documentation shall at all times remain with UTA. No right # is granted to use in advertising, publicity or otherwise any # trademark, service mark, or the name of UTA. # # This software and any associated documentation are provided # "as is," and UTA MAKES NO REPRESENTATIONS OR WARRANTIES, # EXPRESSED OR IMPLIED, INCLUDING THOSE OF MERCHANTABILITY OR # FITNESS FOR A PARTICULAR PURPOSE, OR THAT USE OF THE SOFTWARE, # MODIFICATIONS, OR ASSOCIATED DOCUMENTATION WILL NOT INFRINGE # ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER INTELLECTUAL # PROPERTY RIGHTS OF A THIRD PARTY. UTA, The University of Texas # System, its Regents, officers, and employees shall not be # liable under any circumstances for any direct, indirect, special, # incidental, or consequential damages with respect to any claim # by USER or any third party on account of or arising from the # use, or inability to use, this software or its associated # documentation, even if UTA has been advised of the possibility # of those damages. # # Submit commercialization requests to: The University of Texas # at Arlington, Office of Grant and Contract Services, 701 South # Nedderman Drive, Box 19145, Arlington, Texas 76019-0145, # ATTN: Director of Technology Transfer. # ######################################################################## # # "Voyager" is a trademark of Endeavor Information Systems, Inc. # ########################################################################