From 81946b59bfb937b324221c5910be127a762e1941 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Wed, 17 Sep 2008 21:51:24 +0000 Subject: [PATCH] adding yaz-cleanup, a small tool to scrub data from yaz-marcdump --- Migration Data Work HOWTO.txt | 121 ++++++++++++++++++++++------------------- miker-filter_incumbents.pl | 103 +++++++++++++++++++++++++++++------ yaz-cleanup | 22 ++++++++ 3 files changed, 174 insertions(+), 72 deletions(-) create mode 100755 yaz-cleanup diff --git a/Migration Data Work HOWTO.txt b/Migration Data Work HOWTO.txt index 4bc481c..77147bb 100644 --- a/Migration Data Work HOWTO.txt +++ b/Migration Data Work HOWTO.txt @@ -30,42 +30,42 @@ matching records from the incumbent system fingerprinter -r primary -t 903 -s a -o incoming.primary.fp \ -x incoming.primary.ex incoming.renumbered.mrc.xml -Edit the query_for_primary_matching_incumbent_record.pl script to -point to the correct Evergreen database and table holding the -incumbent primary fingerprints (FIXME add in how to create such a -table). - - query_for_primary_matching_incumbent_record.pl incoming.primary.fp \ - | sort | uniq > primary_matching_incumbent.record_ids - -In a postgres shell, you create a temporary table to hold these id's: - - CREATE TABLE primary_matching_incumbent_records_for_incoming_library - (id BIGINT); - COPY primary_matching_incumbent_records_for_incoming_library - FROM 'primary_matching_incumbent.record_ids'; - -To dump the matching incumbent records to a file, in a postgres shell -do: - - matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value, - regexp_replace(b.marc,E'\n','','g') - FROM biblio.record_entry AS b - JOIN primary_matching_incumbent_records_for_incoming_library - AS c using ( id ); - -Now to turn that dump into a MARCXML file with record numbers and TCN -embedded in tag 901, do: - - marc_add_ids -f id -f tcn_source -f tcn_value -f marc \ - < matching_incumbent_records.dump > matching_incumbent_records.marc.xml - -It's possible that this file may need to be itself sanitized some. -This will transform code=""" into code="&x0022;", for example: - - cat matching_incumbent_records.marc.xml | \ - sed 's/code=\"\"\"/code=\"\"\"/' \ - > matching_incumbent_records.escaped.mrc.xml +#Edit the query_for_primary_matching_incumbent_record.pl script to +#point to the correct Evergreen database and table holding the +#incumbent primary fingerprints (FIXME add in how to create such a +#table). +# +# query_for_primary_matching_incumbent_record.pl incoming.primary.fp \ +# | sort | uniq > primary_matching_incumbent.record_ids +# +#In a postgres shell, you create a temporary table to hold these id's: +# +# CREATE TABLE primary_matching_incumbent_records_for_incoming_library +# (id BIGINT); +# COPY primary_matching_incumbent_records_for_incoming_library +# FROM 'primary_matching_incumbent.record_ids'; +# +#To dump the matching incumbent records to a file, in a postgres shell +#do: +# +# matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value, +# regexp_replace(b.marc,E'\n','','g') +# FROM biblio.record_entry AS b +# JOIN primary_matching_incumbent_records_for_incoming_library +# AS c using ( id ); +# +#Now to turn that dump into a MARCXML file with record numbers and TCN +#embedded in tag 901, do: +# +# marc_add_ids -f id -f tcn_source -f tcn_value -f marc \ +# < matching_incumbent_records.dump > matching_incumbent_records.marc.xml +# +#It's possible that this file may need to be itself sanitized some. +#This will transform code=""" into code="&x0022;", for example: +# +# cat matching_incumbent_records.marc.xml | \ +# sed 's/code=\"\"\"/code=\"\"\"/' \ +# > matching_incumbent_records.escaped.mrc.xml Get full fingerprints for both datasets and match them. @@ -97,24 +97,33 @@ Now to tar up the specific MARC records involved for the dedup interface: cat match.groupings | cut -d^ -f3 > incumbent.record_ids cat match.groupings | cut -d^ -f5 | cut -d, -f2- | sed 's/,/\n/g' \ > incoming.record_ids - mkdir dataset ; cd dataset - select_marc.pl ../incumbent.record_ids 901 c \ - ../matching_incumbent_records.mrc.xml - select_marc.pl ../incoming.record_ids 903 a \ - ../incoming.renumbered.mrc.xml - cd .. - tar cvf dataset.tar dataset - -In a mysql shell for the database used with the dedup interface: - - LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group - FIELDS TERMINATED BY '^' - ( status, dataset, best_record,records,original_records ); - -Create a pretty printed text dump of the non-matching incoming records: - - dump_inverse_select_marc.pl incoming.record_ids 903 a \ - incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \ - non_matching_incoming.mrc.txt.err - +# mkdir dataset ; cd dataset +# select_marc.pl ../incumbent.record_ids 901 c \ +# ../matching_incumbent_records.mrc.xml +# select_marc.pl ../incoming.record_ids 903 a \ +# ../incoming.renumbered.mrc.xml +# cd .. +# tar cvf dataset.tar dataset +# +#In a mysql shell for the database used with the dedup interface: +# +# LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group +# FIELDS TERMINATED BY '^' +# ( status, dataset, best_record,records,original_records ); +# +#Create a pretty printed text dump of the non-matching incoming records: +# +# dump_inverse_select_marc.pl incoming.record_ids 903 a \ +# incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \ +# non_matching_incoming.mrc.txt.err + +marc2bre.pl --idfield=903 --dontuse=live_tcns.txt -f +quitman_non_matching_incoming.mrc.xml -f +catoosa_non_matching_incoming.mrc.xml --marctype=XML > some.bre + +direct_ingest.pl < some.bre > some.ingest + +perl pg_loader.pl -or bre -or mrd -or mfr -or mtfe -or mafe -or msfe +-or mkfe -or msefe -a mrd -a mfr -a mtfe -a mafe -a msfe -a mkfe -a +msefe < ~/gutenberg.ingest > ~/gutenberg.sql diff --git a/miker-filter_incumbents.pl b/miker-filter_incumbents.pl index c68b9c7..d30d81d 100644 --- a/miker-filter_incumbents.pl +++ b/miker-filter_incumbents.pl @@ -1,8 +1,15 @@ #!/usr/bin/perl +use warnings; +use strict; -use Time::HiRes qw/time/; -use MARC::Record; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use Getopt::Long; +#use Time::HiRes qw/time/; +#use MARC::Record; +#use MARC::File::XML ( BinaryEncoding => 'utf-8' ); + +# configuration hashref +my $conf = (); +initialize($conf); my $idfile = shift; my $marcfile = shift; @@ -29,20 +36,84 @@ my $count = 0; my $icount = 0; my $scount = 0; while (<$M>) { + /tag="901" ind1=" " ind2=" ">(\d+){help}); + + $c->{'incoming-tag'} = 903; + $c->{'incoming-subfield'} = 'a'; + $c->{'incoming-matchfile'} = ''; + $c->{'incoming-nomatchfile'} = ''; + $c->{'incumbent-tag'} = 901; + $c->{'incumbent-subfield'} = 'a'; + $c->{'incumbent-matchfile'} = ''; + $c->{'incumbent-nomatchfile'} = ''; + my @keys = keys %{$c}; + show_help() unless (@ARGV and @keys); + for my $key ('renumber-from', 'tag', 'subfield', 'output') + { push @missing, $key unless $c->{$key} } + if (@missing) { + print "Required option: ", join(', ', @missing), " missing!\n"; + show_help(); + } + +} + + +=head2 show_help + +Display usage message when things go wrong - /tag="901" ind1=" " ind2=" ">(\d+) -o Output filename +Any number of input files may be specified; one output file will result. +HELP +exit 1; } - diff --git a/yaz-cleanup b/yaz-cleanup new file mode 100755 index 0000000..3576735 --- /dev/null +++ b/yaz-cleanup @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +open MARC, '<', 'incoming.marc.xml'; +open NUMARC, '>', 'incoming.clean.marc.xml'; + +$line1 = ; + +while ($line2 = ) { + if ($line1 =~ m//) { + if ($line2 =~ m||) { + $line1 = $line2; + next; + } + } + $line1 =~ s/tag=" /tag="00/g; + $line1 =~ s/tag=" /tag="0/g; + $line1 =~ s/tag="-/tag="0/g; + $line1 =~ s/tag="(\d\d) /tag="0$1/g; + print NUMARC $line1; + $line1 = $line2; +} +print NUMARC $line1; -- 1.7.2.5