fingerprinter -r primary -t 903 -s a -o incoming.primary.fp \
-x incoming.primary.ex incoming.renumbered.mrc.xml
-Edit the query_for_primary_matching_incumbent_record.pl script to
-point to the correct Evergreen database and table holding the
-incumbent primary fingerprints (FIXME add in how to create such a
-table).
-
- query_for_primary_matching_incumbent_record.pl incoming.primary.fp \
- | sort | uniq > primary_matching_incumbent.record_ids
-
-In a postgres shell, you create a temporary table to hold these id's:
-
- CREATE TABLE primary_matching_incumbent_records_for_incoming_library
- (id BIGINT);
- COPY primary_matching_incumbent_records_for_incoming_library
- FROM 'primary_matching_incumbent.record_ids';
-
-To dump the matching incumbent records to a file, in a postgres shell
-do:
-
- matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value,
- regexp_replace(b.marc,E'\n','','g')
- FROM biblio.record_entry AS b
- JOIN primary_matching_incumbent_records_for_incoming_library
- AS c using ( id );
-
-Now to turn that dump into a MARCXML file with record numbers and TCN
-embedded in tag 901, do:
-
- marc_add_ids -f id -f tcn_source -f tcn_value -f marc \
- < matching_incumbent_records.dump > matching_incumbent_records.marc.xml
-
-It's possible that this file may need to be itself sanitized some.
-This will transform code=""" into code="&x0022;", for example:
-
- cat matching_incumbent_records.marc.xml | \
- sed 's/code=\"\"\"/code=\"\"\"/' \
- > matching_incumbent_records.escaped.mrc.xml
+#Edit the query_for_primary_matching_incumbent_record.pl script to
+#point to the correct Evergreen database and table holding the
+#incumbent primary fingerprints (FIXME add in how to create such a
+#table).
+#
+# query_for_primary_matching_incumbent_record.pl incoming.primary.fp \
+# | sort | uniq > primary_matching_incumbent.record_ids
+#
+#In a postgres shell, create a temporary table to hold these IDs:
+#
+# CREATE TABLE primary_matching_incumbent_records_for_incoming_library
+# (id BIGINT);
+# COPY primary_matching_incumbent_records_for_incoming_library
+# FROM 'primary_matching_incumbent.record_ids';
+#
+#To dump the matching incumbent records to a file, in a postgres shell
+#do:
+#
+# matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value,
+# regexp_replace(b.marc,E'\n','','g')
+# FROM biblio.record_entry AS b
+# JOIN primary_matching_incumbent_records_for_incoming_library
+# AS c using ( id );
+#
+#Now to turn that dump into a MARCXML file with record numbers and TCN
+#embedded in tag 901, do:
+#
+# marc_add_ids -f id -f tcn_source -f tcn_value -f marc \
+# < matching_incumbent_records.dump > matching_incumbent_records.marc.xml
+#
+#It's possible that this file may need to be itself sanitized some.
+#This will transform code=""" into code="&#x0022;", for example:
+#
+# cat matching_incumbent_records.marc.xml | \
+# sed 's/code=\"\"\"/code=\"\&#x0022;\"/' \
+# > matching_incumbent_records.escaped.mrc.xml
Get full fingerprints for both datasets and match them.
cat match.groupings | cut -d^ -f3 > incumbent.record_ids
cat match.groupings | cut -d^ -f5 | cut -d, -f2- | sed 's/,/\n/g' \
> incoming.record_ids
- mkdir dataset ; cd dataset
- select_marc.pl ../incumbent.record_ids 901 c \
- ../matching_incumbent_records.mrc.xml
- select_marc.pl ../incoming.record_ids 903 a \
- ../incoming.renumbered.mrc.xml
- cd ..
- tar cvf dataset.tar dataset
-
-In a mysql shell for the database used with the dedup interface:
-
- LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group
- FIELDS TERMINATED BY '^'
- ( status, dataset, best_record,records,original_records );
-
-Create a pretty printed text dump of the non-matching incoming records:
-
- dump_inverse_select_marc.pl incoming.record_ids 903 a \
- incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \
- non_matching_incoming.mrc.txt.err
-
+# mkdir dataset ; cd dataset
+# select_marc.pl ../incumbent.record_ids 901 c \
+# ../matching_incumbent_records.mrc.xml
+# select_marc.pl ../incoming.record_ids 903 a \
+# ../incoming.renumbered.mrc.xml
+# cd ..
+# tar cvf dataset.tar dataset
+#
+#In a mysql shell for the database used with the dedup interface:
+#
+# LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group
+# FIELDS TERMINATED BY '^'
+# ( status, dataset, best_record,records,original_records );
+#
+#Create a pretty printed text dump of the non-matching incoming records:
+#
+# dump_inverse_select_marc.pl incoming.record_ids 903 a \
+# incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \
+# non_matching_incoming.mrc.txt.err
+
+marc2bre.pl --idfield=903 --dontuse=live_tcns.txt \
+ -f quitman_non_matching_incoming.mrc.xml \
+ -f catoosa_non_matching_incoming.mrc.xml --marctype=XML > some.bre
+
+direct_ingest.pl < some.bre > some.ingest
+
+perl pg_loader.pl -or bre -or mrd -or mfr -or mtfe -or mafe -or msfe \
+ -or mkfe -or msefe -a mrd -a mfr -a mtfe -a mafe -a msfe -a mkfe \
+ -a msefe < ~/gutenberg.ingest > ~/gutenberg.sql
#!/usr/bin/perl
+use warnings;
+use strict;
-use Time::HiRes qw/time/;
-use MARC::Record;
-use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use Getopt::Long;
+#use Time::HiRes qw/time/;
+#use MARC::Record;
+#use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+
+# configuration hashref -- must be a real hash reference ({}), not an
+# empty list: initialize() hands it to GetOptions as the storage hash
+# and fills it in place, so an undef value here would lose every option.
+my $conf = {};
+initialize($conf);
my $idfile = shift;
my $marcfile = shift;
my $icount = 0;
my $scount = 0;
while (<$M>) {
+    # Route each line read from $M to one of two output handles, keyed on
+    # the record id found in its 901$a. Capturing in list context gives
+    # undef when the line has no such field; testing $1 after an unchecked
+    # match would instead consult whatever an earlier successful match
+    # left behind (or undef, with a warning).
+    my ($rec_id) = /tag="901" ind1=" " ind2=" "><subfield code="a">(\d+)</;
+    if ( defined $rec_id and $id{$rec_id} ) {
+        # id is present in %id: counted as "import"
+        print $I $_;
+        $icount++;
+    } else {
+        # unknown id, or no id on this line: counted as "shelved"
+        print $S $_;
+        $scount++;
+    }
+    $count++;
+
+    # Progress report to STDERR every 100 lines.
+    unless ($count && $count % 100) {
+        # Without Time::HiRes, time() ticks in whole seconds, so the
+        # elapsed time can still be 0 here; avoid dividing by zero.
+        my $elapsed = time - $starttime;
+        my $rate    = $elapsed > 0 ? $count / $elapsed : 0;
+        print STDERR "\r$count\t(shelved: $scount, import: $icount)\t". $rate;
+    }
+}
+
+=head2 initialize
+
+Performs boring script initialization. Handles argument parsing,
+mostly.
+
+=cut
+
+sub initialize {
+    # Takes the configuration hash reference, fills it in place with
+    # defaults and parsed command-line options, and returns it.
+    # Calls show_help() (which exits) on a parse error, on --help, when
+    # no non-option arguments remain in @ARGV, or when a required option
+    # is absent.
+    my ($c) = @_;
+    my @missing = ();
+
+    # GetOptions only stores into a hash when its first argument is a
+    # hash reference, and the caller only sees results written through
+    # its own reference -- so refuse anything else up front.
+    die "initialize() requires a hash reference\n"
+        unless ref $c eq 'HASH';
+
+    # set mode on existing filehandles
+    binmode(STDIN, ':utf8');
+
+    # Seed defaults BEFORE parsing. GetOptions writes a key only for an
+    # option that actually appears on the command line, so user-supplied
+    # values override these instead of being overwritten by them.
+    $c->{'incoming-tag'}          = 903;
+    $c->{'incoming-subfield'}     = 'a';
+    $c->{'incoming-matchfile'}    = '';
+    $c->{'incoming-nomatchfile'}  = '';
+    $c->{'incumbent-tag'}         = 901;
+    $c->{'incumbent-subfield'}    = 'a';
+    $c->{'incumbent-matchfile'}   = '';
+    $c->{'incumbent-nomatchfile'} = '';
+
+    my $rc = GetOptions( $c,
+        'incoming',
+        'incumbent',
+        'incoming-tag|incot=i',
+        'incoming-subfield|incos=s',
+        'incumbent-tag|incut=i',
+        'incumbent-subfield|incus=s',
+        'output|o=s',
+        'help|h',
+    );
+    show_help() unless $rc;
+    show_help() if ($c->{help});
+
+    my @keys = keys %{$c};
+    show_help() unless (@ARGV and @keys);
+
+    # Only options declared in the spec above can ever be set, so only
+    # those may be listed as required.
+    for my $key ('output')
+        { push @missing, $key unless $c->{$key} }
+    if (@missing) {
+        print "Required option: ", join(', ', @missing), " missing!\n";
+        show_help();
+    }
+
+    return $c;
+}
+
+
+=head2 show_help
+
+Display usage message when things go wrong
- /tag="901" ind1=" " ind2=" "><subfield code="a">(\d+)</;
- if ( $id{$1} ) {
- print $I $_;
- $icount++;
- } else {
- print $S $_;
- $scount++;
- }
- $count++;
+=cut
- unless ($count && $count % 100) {
- print STDERR "\r$count\t(shelved: $scount, import: $icount)\t". $count / (time - $starttime);
- }
+sub show_help {
+# Print the usage summary to STDOUT and exit with status 1; never returns.
+# The option names and short aliases listed here mirror the GetOptions
+# spec in initialize().
+print <<HELP;
+Usage is: $0 [OPTIONS] <input files>
+Req'd Arguments
+ --output=<file>         -o       Output filename
+Options
+ --incoming                       Boolean switch
+ --incumbent                      Boolean switch
+ --incoming-tag=N        --incot  Incoming tag (default 903)
+ --incoming-subfield=X   --incos  Incoming subfield (default a)
+ --incumbent-tag=N       --incut  Incumbent tag (default 901)
+ --incumbent-subfield=X  --incus  Incumbent subfield (default a)
+ --help                  -h       Show this message
+Any number of input files may be specified; one output file will result.
+HELP
+exit 1;
}
-