From 81946b59bfb937b324221c5910be127a762e1941 Mon Sep 17 00:00:00 2001
From: Shawn Boyette <sboyette@esilibrary.com>
Date: Wed, 17 Sep 2008 21:51:24 +0000
Subject: [PATCH] adding yaz-cleanup, a small tool to scrub data from yaz-marcdump

---
 Migration Data Work HOWTO.txt |  121 ++++++++++++++++++++++-------------------
 miker-filter_incumbents.pl    |  103 +++++++++++++++++++++++++++++------
 yaz-cleanup                   |   22 ++++++++
 3 files changed, 174 insertions(+), 72 deletions(-)
 create mode 100755 yaz-cleanup

diff --git a/Migration Data Work HOWTO.txt b/Migration Data Work HOWTO.txt
index 4bc481c..77147bb 100644
--- a/Migration Data Work HOWTO.txt	
+++ b/Migration Data Work HOWTO.txt	
@@ -30,42 +30,42 @@ matching records from the incumbent system
   fingerprinter -r primary -t 903 -s a -o incoming.primary.fp \
     -x incoming.primary.ex incoming.renumbered.mrc.xml
 
-Edit the query_for_primary_matching_incumbent_record.pl script to
-point to the correct Evergreen database and table holding the
-incumbent primary fingerprints (FIXME add in how to create such a
-table).
-
-  query_for_primary_matching_incumbent_record.pl incoming.primary.fp \
-    | sort | uniq > primary_matching_incumbent.record_ids
-
-In a postgres shell, you create a temporary table to hold these id's:
-
-  CREATE TABLE primary_matching_incumbent_records_for_incoming_library
-         (id BIGINT);
-  COPY primary_matching_incumbent_records_for_incoming_library
-       FROM 'primary_matching_incumbent.record_ids';
-
-To dump the matching incumbent records to a file, in a postgres shell
-do:
-
-  matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value,
-    regexp_replace(b.marc,E'\n','','g')
-    FROM biblio.record_entry AS b
-    JOIN primary_matching_incumbent_records_for_incoming_library
-    AS c using ( id );
-
-Now to turn that dump into a MARCXML file with record numbers and TCN
-embedded in tag 901, do:
-
-  marc_add_ids -f id -f tcn_source -f tcn_value -f marc \
-    < matching_incumbent_records.dump > matching_incumbent_records.marc.xml
-
-It's possible that this file may need to be itself sanitized some.
-This will transform code=""" into code="&x0022;", for example:
-
-  cat matching_incumbent_records.marc.xml | \
-    sed 's/code=\"\"\"/code=\"\&#x0022;\"/' \
-    > matching_incumbent_records.escaped.mrc.xml
+#Edit the query_for_primary_matching_incumbent_record.pl script to
+#point to the correct Evergreen database and table holding the
+#incumbent primary fingerprints (FIXME add in how to create such a
+#table).
+#
+#  query_for_primary_matching_incumbent_record.pl incoming.primary.fp \
+#    | sort | uniq > primary_matching_incumbent.record_ids
+#
+#In a postgres shell, you create a temporary table to hold these id's:
+#
+#  CREATE TABLE primary_matching_incumbent_records_for_incoming_library
+#         (id BIGINT);
+#  COPY primary_matching_incumbent_records_for_incoming_library
+#       FROM 'primary_matching_incumbent.record_ids';
+#
+#To dump the matching incumbent records to a file, in a postgres shell
+#do:
+#
+#  matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value,
+#    regexp_replace(b.marc,E'\n','','g')
+#    FROM biblio.record_entry AS b
+#    JOIN primary_matching_incumbent_records_for_incoming_library
+#    AS c using ( id );
+#
+#Now to turn that dump into a MARCXML file with record numbers and TCN
+#embedded in tag 901, do:
+#
+#  marc_add_ids -f id -f tcn_source -f tcn_value -f marc \
+#    < matching_incumbent_records.dump > matching_incumbent_records.marc.xml
+#
+#It's possible that this file may need to be itself sanitized some.
+#This will transform code=""" into code="&#x0022;", for example:
+#
+#  cat matching_incumbent_records.marc.xml | \
+#    sed 's/code=\"\"\"/code=\"\&#x0022;\"/' \
+#    > matching_incumbent_records.escaped.mrc.xml
 
 Get full fingerprints for both datasets and match them.
 
@@ -97,24 +97,33 @@ Now to tar up the specific MARC records involved for the dedup interface:
   cat match.groupings | cut -d^ -f3 > incumbent.record_ids
   cat match.groupings | cut -d^ -f5 | cut -d, -f2- | sed 's/,/\n/g' \
     > incoming.record_ids
-  mkdir dataset ; cd dataset
-  select_marc.pl ../incumbent.record_ids 901 c \
-    ../matching_incumbent_records.mrc.xml
-  select_marc.pl ../incoming.record_ids 903 a \
-    ../incoming.renumbered.mrc.xml
-  cd .. 
-  tar cvf dataset.tar dataset
-
-In a mysql shell for the database used with the dedup interface:
-
-  LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group
-    FIELDS TERMINATED BY '^' 
-    ( status, dataset, best_record,records,original_records );
-
-Create a pretty printed text dump of the non-matching incoming records:
-
-  dump_inverse_select_marc.pl incoming.record_ids 903 a \
-    incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \
-    non_matching_incoming.mrc.txt.err
-
 
+#  mkdir dataset ; cd dataset
+#  select_marc.pl ../incumbent.record_ids 901 c \
+#    ../matching_incumbent_records.mrc.xml
+#  select_marc.pl ../incoming.record_ids 903 a \
+#    ../incoming.renumbered.mrc.xml
+#  cd .. 
+#  tar cvf dataset.tar dataset
+#
+#In a mysql shell for the database used with the dedup interface:
+#
+#  LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group
+#    FIELDS TERMINATED BY '^' 
+#    ( status, dataset, best_record,records,original_records );
+#
+#Create a pretty printed text dump of the non-matching incoming records:
+#
+#  dump_inverse_select_marc.pl incoming.record_ids 903 a \
+#    incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \
+#    non_matching_incoming.mrc.txt.err
+
+  marc2bre.pl --idfield=903 --dontuse=live_tcns.txt \
+    -f quitman_non_matching_incoming.mrc.xml \
+    -f catoosa_non_matching_incoming.mrc.xml --marctype=XML > some.bre
+
+direct_ingest.pl < some.bre > some.ingest
+
+  perl pg_loader.pl -or bre -or mrd -or mfr -or mtfe -or mafe -or msfe \
+    -or mkfe -or msefe -a mrd -a mfr -a mtfe -a mafe -a msfe -a mkfe \
+    -a msefe < some.ingest > some.sql
diff --git a/miker-filter_incumbents.pl b/miker-filter_incumbents.pl
index c68b9c7..d30d81d 100644
--- a/miker-filter_incumbents.pl
+++ b/miker-filter_incumbents.pl
@@ -1,8 +1,15 @@
 #!/usr/bin/perl
+use warnings;
+use strict;
 
-use Time::HiRes qw/time/;
-use MARC::Record;
-use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use Getopt::Long;
+#use Time::HiRes qw/time/;
+#use MARC::Record;
+#use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+
+# configuration hashref; must be a real reference because initialize() hands it to GetOptions
+my $conf  = {};
+initialize($conf);
 
 my $idfile = shift;
 my $marcfile = shift;
@@ -29,20 +36,84 @@ my $count = 0;
 my $icount = 0;
 my $scount = 0;
 while (<$M>) {
+    # $1 is only valid after a successful match, so capture into a lexical and test it
+    my ($rec_id) = /tag="901" ind1=" " ind2=" "><subfield code="a">(\d+)</;
+    if ( defined $rec_id and $id{$rec_id} ) {
+        print $I $_;
+        $icount++;
+    } else {
+        print $S $_;
+        $scount++;
+    }
+    $count++;
+    next if $count % 100;    # progress line every 100 records
+    my $elapsed = (time - $starttime) || 1;    # avoid division by zero when no time has elapsed
+    print STDERR "\r$count\t(shelved: $scount, import: $icount)\t". $count / $elapsed;
+}
+
+=head2 initialize
+
+Performs boring script initialization. Handles argument parsing,
+mostly.
+
+=cut
+
+sub initialize {
+    my ($c) = @_;
+    my @missing = ();
+
+    # set mode on existing filehandles
+    binmode(STDIN, ':utf8');
+
+    # defaults go in first so that parsed options override them, not vice versa
+    $c->{'incoming-tag'}          = 903;
+    $c->{'incoming-subfield'}     = 'a';
+    $c->{'incoming-matchfile'}    = '';
+    $c->{'incoming-nomatchfile'}  = '';
+    $c->{'incumbent-tag'}         = 901;
+    $c->{'incumbent-subfield'}    = 'a';
+    $c->{'incumbent-matchfile'}   = '';
+    $c->{'incumbent-nomatchfile'} = '';
+
+    my $rc = GetOptions( $c,
+                         'incoming',
+                         'incumbent',
+                         'incoming-tag|incot=i',
+                         'incoming-subfield|incos=s',
+                         'incumbent-tag|incut=i',
+                         'incumbent-subfield|incus=s',
+                         'output|o=s',
+                         'help|h',
+                       );
+    show_help() unless $rc;
+    show_help() if ($c->{help});
+    show_help() unless @ARGV;    # require at least one non-option argument
+    # NOTE(review): renumber-from/tag/subfield are not declared in the spec above, so they always count as missing -- confirm list
+    for my $key ('renumber-from', 'tag', 'subfield', 'output')
+      { push @missing, $key unless $c->{$key} }
+    if (@missing) {
+        print "Required option: ", join(', ', @missing), " missing!\n";
+        show_help();
+    }
+}
+
+
+=head2 show_help
+
+Display usage message when things go wrong
 
-	/tag="901" ind1=" " ind2=" "><subfield code="a">(\d+)</;
-	if ( $id{$1} ) {
-		print $I $_;
-		$icount++;
-	} else {
-		print $S $_;
-		$scount++;
-	}
-	$count++;
+=cut
 
-	unless ($count && $count % 100) {
-		print STDERR "\r$count\t(shelved: $scount, import: $icount)\t". $count / (time - $starttime);
-	}
+sub show_help {
+    print <<HELP;
+Usage is: $0 [REQUIRED ARGS]
+Req'd Arguments
+  --renumber-from=N        -rf First id# of new sequence
+  --tag=N                  -t  Which tag to use
+  --subfield=X             -s  Which subfield to use
+  --output=<file>          -o  Output filename
 
+Any number of input files may be specified; one output file will result.
+HELP
+    exit 1;
 }
-		
diff --git a/yaz-cleanup b/yaz-cleanup
new file mode 100755
index 0000000..3576735
--- /dev/null
+++ b/yaz-cleanup
@@ -0,0 +1,22 @@
+#!/usr/bin/perl
+# Scrub yaz-marcdump XML: drop empty blank-indicator datafields, zero-pad padded tags.
+use strict;
+use warnings;
+open my $in,  '<', 'incoming.marc.xml'       or die "incoming.marc.xml: $!";
+open my $out, '>', 'incoming.clean.marc.xml' or die "incoming.clean.marc.xml: $!";
+my $line1 = <$in>;
+while (defined(my $line2 = <$in>)) {
+    if ($line1 =~ m/<datafield tag="..." ind1=" " ind2=" ">/
+        and $line2 =~ m|</datafield>|) {
+        $line1 = <$in>;    # skip the whole empty element, closing tag included
+        next;
+    }
+    $line1 =~ s/tag="  /tag="00/g;
+    $line1 =~ s/tag=" /tag="0/g;
+    $line1 =~ s/tag="-/tag="0/g;
+    $line1 =~ s/tag="(\d\d) /tag="0$1/g;
+    print {$out} $line1;
+    $line1 = $line2;
+}
+print {$out} $line1 if defined $line1;
+close $out or die "incoming.clean.marc.xml: $!";
-- 
1.7.2.5