changing verbiage for bib loading table
[migration-tools.git] / marc_cleanup
index e2a9390..58a68c1 100755 (executable)
@@ -111,7 +111,7 @@ while ( buildrecord() ) {
         # subfields can't be non-alphanumeric
         if ($record[$ptr] =~ /<subfield code="(.*?)"/) {
             if ($1 =~ /\P{IsAlnum}/ or $1 eq '') {
-                edit("Junk in subfield code/Null subfield code");
+                edit("Junk in subfield code/Null subfield code ($1)");
                 next;
             }
         }
@@ -177,6 +177,11 @@ sub do_automated_cleanups {
                 message("Short leader padded");
             }
         }
+        if ($c->{'force-utf8'}) {
+            if ($record[$ptr] =~ m|<leader>(.........).(.+)</leader>|) {
+                $record[$ptr] = "<leader>$1a$2</leader>\n";
+            }
+        }
         if ($record[$ptr] =~ m|<controlfield tag="008">(.+?)</control|) {
             #pad short 008
             my $content = $1;
@@ -200,13 +205,28 @@ sub do_automated_cleanups {
         }
 
         # automatable subfield maladies
-        $record[$ptr] =~ s/code=" ">c/code="c">/;
-        $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+        if ($record[$ptr] =~ /code=" ">c/) {
+            message('Fixing probable subfield c, scenario 1');
+            $record[$ptr] =~ s/code=" ">c/code="c">/;
+        }
+        if ($record[$ptr] =~ /code=" ">\$/) {
+            message('Fixing probable subfield c, scenario 2');
+            $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+        }
 
         if ($c->{'fix-subfield'}) {
-            $record[$ptr] =~ s/code="&amp;">/code="$c->{'fix-subfield'}">/;
-            $record[$ptr] =~ s/code="\P{IsAlnum}">/code="$c->{'fix-subfield'}">/;
-            $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/;
+            if ($record[$ptr] =~ /code="&amp;">/) {
+                message('Fixing &amp; for subfield code');
+                $record[$ptr] =~ s/code="&amp;">/code="$c->{'fix-subfield'}">/;
+            }
+            if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) {
+                message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'});
+                $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/;
+            }
+            if ($record[$ptr] =~ /code="">/) {
+                message('Fixing null subfield code');
+                $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/;
+            }
         }
     }
     return 0;
@@ -622,6 +642,7 @@ sub initialize {
                          'original-tag|ot=i',
                          'original-subfield|os=s',
                          'fix-subfield|fs=s',
+                         'force-utf8',
                          'script',
                          'no-strip9',
                          'trashfile|t=s',
@@ -685,6 +706,8 @@ Options
                            and renumbering is in effect, an old-to-new mapping
                            file (old2new.map) will be generated.
 
+  --force-utf8             Rewrite each record so that they describe themselves as
+                           UTF-8 encoded
   --autoscrub         -a   Automatically remove non-numeric tags in data
   --fix-subfield      -fs  Subfield code to use in place of non-alphanumeric
                            or empty subfield codes