X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=58a68c163e0786ee2ef20ce154ce2de3f3b1e398;hp=e2a9390ecf6a33ce6cf52dc91b5664faf2989c36;hb=4ec23f88c7f20557b13ec6f5be637e726904e114;hpb=693b0fe39fc1dc61f807359a9632fae668659d8c diff --git a/marc_cleanup b/marc_cleanup index e2a9390..58a68c1 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -111,7 +111,7 @@ while ( buildrecord() ) { # subfields can't be non-alphanumeric if ($record[$ptr] =~ /{'force-utf8'}) { + if ($record[$ptr] =~ m|(.........).(.+)|) { + $record[$ptr] = "$1a$2\n"; + } + } if ($record[$ptr] =~ m|(.+?)c/code="c">/; - $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + if ($record[$ptr] =~ /code=" ">c/) { + message('Fixing probable subfield c, scenario 1'); + $record[$ptr] =~ s/code=" ">c/code="c">/; + } + if ($record[$ptr] =~ /code=" ">\$/) { + message('Fixing probable subfield c, scenario 2'); + $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + } if ($c->{'fix-subfield'}) { - $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; - $record[$ptr] =~ s/code="\P{IsAlnum}">/code="$c->{'fix-subfield'}">/; - $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + if ($record[$ptr] =~ /code="&">/) { + message('Fixing & for subfield code'); + $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) { + message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'}); + $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="">/) { + message('Fixing null subfield code'); + $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + } } } return 0; @@ -622,6 +642,7 @@ sub initialize { 'original-tag|ot=i', 'original-subfield|os=s', 'fix-subfield|fs=s', + 'force-utf8', 'script', 'no-strip9', 'trashfile|t=s', @@ -685,6 +706,8 @@ Options and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. 
+ --force-utf8 Rewrite each record so that it describes itself as + UTF-8 encoded --autoscrub -a Automatically remove non-numeric tags in data --fix-subfield -fs Subfield code to use in place of non-alphanumeric or empty subfield codes