From: Jason Etheridge
Date: Tue, 27 May 2008 19:20:00 +0000 (+0000)
Subject: initial import
X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=34597da047b2257b29ce1da435c1ebe3eef972f2

initial import
---
34597da047b2257b29ce1da435c1ebe3eef972f2
diff --git a/Migration Data Work HOWTO.pdf b/Migration Data Work HOWTO.pdf
new file mode 100644
index 0000000..47d8857
Binary files /dev/null and b/Migration Data Work HOWTO.pdf differ
diff --git a/Migration Data Work HOWTO.txt b/Migration Data Work HOWTO.txt
new file mode 100644
index 0000000..409a593
--- /dev/null
+++ b/Migration Data Work HOWTO.txt
@@ -0,0 +1,74 @@

Migration Data Work HOWTO / Toolkit
The following is for migrating into an existing system like PINES:

Get the incoming bib data, and translate it to UTF-8 MARCXML.  It may contain holdings.  It may contain XML or MARC errors that you have to sanitize before your tools will work.
This is one way to translate MARC-8 MARC21 to UTF-8 MARCXML:
yaz-marcdump -f MARC-8 -t UTF-8 -o marcxml incoming.mrc > incoming.mrc.xml


If you need to trim the bibs to a subset based on the presence of a certain value in a specific tag/subfield (for example, if you have the bibs for all libraries in a foreign system and only need the bibs belonging to a specific migrating library, you might filter based on their holding tags):
trim_marc_based_on_tag_subfield_value.pl 999 m BRANCH_CODE incoming.mrc.xml > incoming.filtered.mrc.xml


Embed potential native record ids into the incumbent records:

set_record_ids.pl 100000 903 a incoming.mrc.xml > incoming.renumbered.mrc.xml


Get primary fingerprints for the incoming data and get a bib dump of matching records from the incumbent system:

fingerprints.pl primary 903 a incoming.renumbered.mrc.xml > incoming.primary.fp 2> incoming.primary.fp_err

Edit the query_for_primary_matching_incumbent_record.pl script to point to the correct Evergreen database and the table holding the incumbent primary fingerprints (FIXME: add in how to create such a table).
query_for_primary_matching_incumbent_record.pl incoming.primary.fp | sort | uniq > primary_matching_incumbent.record_ids

In a postgres shell, you create a temporary table to hold these id's:
CREATE TABLE primary_matching_incumbent_records_for_incoming_library ( id BIGINT );
COPY primary_matching_incumbent_records_for_incoming_library FROM 'primary_matching_incumbent.record_ids';

To dump the matching incumbent records to a file, in a postgres shell do:
\t
\o matching_incumbent_records.dump
select b.id, b.tcn_source, b.tcn_value, regexp_replace(b.marc,E'\n','','g') from biblio.record_entry as b join primary_matching_incumbent_records_for_incoming_library as c using ( id ) ;

Now to turn that dump into a MARCXML file with record numbers and TCN embedded in tag 901, do:
marc_add_ids -f id -f tcn_source -f tcn_value -f marc < matching_incumbent_records.dump > matching_incumbent_records.mrc.xml


It's possible that this file may need to be itself sanitized some.  This will transform code=""" into code="&#x0022;", for example:
cat matching_incumbent_records.mrc.xml | sed 's/code="""/code="\&#x0022;"/' > matching_incumbent_records.escaped.mrc.xml

Get full fingerprints for both datasets and match them:

fingerprints.pl full 901 c matching_incumbent_records.mrc.xml > incumbent.fp 2> incumbent.fp_err
fingerprints.pl full 903 a incoming.renumbered.mrc.xml > incoming.fp 2> incoming.fp_err

The script below will produce matched groupings, and can optionally take a 4th and 5th parameter providing scoring information for determining lead records.  In the past, this would consider certain metrics for MARC quality, but in the latest incarnation, it assumes an incumbent record will be the lead record, and looks at # of holdings and possible matching of tag 245 subfield b for determining which of the incumbent records would be the lead record.  The example invocation below does not use scoring.
match_fingerprints.pl "name of dataset for dedup interface" incumbent.fp incoming.fp

This will produce two files, match.groupings and match.record_ids.  The format for match.groupings is suitable for insertion into the db for the dedup interface.

Import these matches and records into the legacy dedup interface for viewing.

Now to tar up the specific MARC records involved for the dedup interface:

cat match.groupings | cut -d^ -f3 > incumbent.record_ids
cat match.groupings | cut -d^ -f5 | cut -d, -f2- | sed 's/,/\n/g' > incoming.record_ids

mkdir dataset ; cd dataset
select_marc.pl ../incumbent.record_ids 901 c ../matching_incumbent_records.mrc.xml
select_marc.pl ../incoming.record_ids 903 a ../incoming.renumbered.mrc.xml
cd ..
tar cvf dataset.tar dataset

In a mysql shell for the database used with the dedup interface:
LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group FIELDS TERMINATED BY '^' ( status, dataset, best_record, records, original_records );


Create a pretty printed text dump of the non-matching incoming records:

dump_inverse_select_marc.pl incoming.record_ids 903 a incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> non_matching_incoming.mrc.txt.err


diff --git a/dump_inverse_select_marc.pl b/dump_inverse_select_marc.pl
new file mode 100755
index 0000000..e71997a
--- /dev/null
+++ b/dump_inverse_select_marc.pl
#!/usr/bin/perl
# dump_inverse_select_marc.pl RECORD_ID_FILE ID_TAG ID_SUBFIELD MARCXML_FILE...
#
# Pretty-prints (on STDOUT) every record in the given MARCXML files whose id
# (ID_TAG $ID_SUBFIELD) is NOT listed in RECORD_ID_FILE (one id per line).
# Records missing the id tag entirely are reported on STDERR and skipped.
use strict;
use warnings;
use MARC::Batch;
use MARC::Record;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;

my $record_id_file = $ARGV[0];
my %record_ids;

open my $ids_fh, '<', $record_id_file
    or die "Cannot open $record_id_file: $!";
while (my $record_id = <$ids_fh>) {
    chomp $record_id;
    $record_ids{$record_id} = 1;
}
close $ids_fh;

my $id_tag      = $ARGV[1];
my $id_subfield = $ARGV[2];

binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');

foreach my $argnum (3 .. $#ARGV) {

    print STDERR "Processing " . $ARGV[$argnum] . "\n";

    my $batch = MARC::Batch->new('XML', $ARGV[$argnum]);
    $batch->strict_off();
    $batch->warnings_off();

    my $count = 0;

    while (my $record = $batch->next()) {

        $count++;

        my $id = $record->field($id_tag);
        if (!$id) {
            print STDERR "ERROR: This record is missing a $id_tag field.\n"
                . $record->as_formatted() . "\n=====\n";
            next;
        }
        $id = $id->as_string($id_subfield);

        # Inverse selection: dump only the records NOT in the id list.
        if (!defined $record_ids{$id}) {
            print STDOUT '=-' x 39 . "\n";
            print STDOUT $record->as_formatted() . "\n";
        }
    }
    print STDERR "Processed $count records.\n";
}
diff --git a/fingerprints.pl b/fingerprints.pl
new file mode 100755
index 0000000..a7d43ee
--- /dev/null
+++ b/fingerprints.pl
#!/usr/bin/perl
# fingerprints.pl (primary|full) ID_TAG ID_SUBFIELD MARCXML_FILE...
#
# Emits tab-delimited fingerprint lines on STDOUT for each bib record:
#   primary: id, item_form, date1, record_type, bib_lvl, title
#   full:    one line per applicable "case" (a..e), appending isbn/pages,
#            edition, issn, lccn, or author/publisher/pubyear/pages
# Records missing a required fingerprint element are reported on STDERR.
use strict;
use warnings;
use MARC::Batch;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;
use Unicode::Normalize;

my $count       = 0;               # global across all input files
my $which       = $ARGV[0];        # "primary" or "full"
my $id_tag      = $ARGV[1];
my $id_subfield = $ARGV[2];

binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');

foreach my $argnum (3 .. $#ARGV) {

    print STDERR "Processing " . $ARGV[$argnum] . "\n";

    my $batch = MARC::Batch->new('XML', $ARGV[$argnum]);
    $batch->strict_off();
    $batch->warnings_off();

    while (my $record = $batch->next()) {

        $count++;

        my $id = $record->field($id_tag);
        if (!$id) {
            print STDERR "ERROR: This record is missing a $id_tag field.\n"
                . $record->as_formatted() . "\n=====\n";
            next;
        }
        $id = $id->as_string($id_subfield);

        # was: @warnings was never populated; pull this record's parser warnings
        my @warnings = $batch->warnings();
        print STDERR "WARNINGS: Record id " . $id . " : " . join(":", @warnings)
            . " : continuing...\n" if (@warnings);

        my $leader      = $record->leader();
        my $record_type = substr($leader, 6, 1);
        my $bib_lvl     = substr($leader, 7, 1);

        my $my_008 = $record->field('008');
        $my_008 = $my_008->as_string() if ($my_008);
        my ($date1, $date2, $item_form);
        if ($my_008) {
            $date1 = substr($my_008, 7, 4);
            $date2 = substr($my_008, 11, 4);
            # Map/visual-type materials carry form-of-item at 008/29,
            # everything else at 008/23.
            if ($record_type =~ /[gkroef]/) {    # MAP, VIS
                $item_form = substr($my_008, 29, 1);
            } else {
                $item_form = substr($my_008, 23, 1);
            }
        }

        my $title = $record->field('245');
        if ($title) { $title = $title->subfield('a'); }

        my @isbns = ();
        foreach my $f ($record->field('020'), $record->field('024')) {
            if ($f->subfield('a') && $f->subfield('a') =~ /(\S+)/) {
                push @isbns, $1;
            }
        }

        my $issn = $record->field('022');
        if ($issn) { $issn = $issn->subfield('a'); }
        my $lccn = $record->field('010');
        if ($lccn) { $lccn = $lccn->subfield('a'); }

        # First of personal (100), corporate (110), or meeting (111) name wins.
        my $author;
        foreach my $tag ('100', '110', '111') {
            next if $author;
            if ($record->field($tag)) { $author = $record->field($tag)->subfield('a'); }
        }

        my $desc = $record->field('300');
        if ($desc) { $desc = $desc->subfield('a'); }
        my $pages;
        if ($desc && $desc =~ /(\d+)/) { $pages = $1; }

        my $my_260 = $record->field('260');
        my ($publisher, $pubyear);
        if ($my_260) {
            $publisher = $my_260->subfield('b');
            $pubyear   = $my_260->subfield('c');
        }
        if ($pubyear) {
            if ($pubyear =~ /(\d\d\d\d)/) { $pubyear = $1; } else { $pubyear = ''; }
        }
        my $edition = $record->field('250');
        if ($edition) { $edition = $edition->subfield('a'); }

        # NORMALIZE
        # was: "==" compares numerically, so the test matched EVERY record_type
        # and forced them all to 'a'; "eq" applies the default only to blanks.
        if ($record_type eq ' ') { $record_type = 'a'; }
        if ($title) {
            $title = NFD($title);
            $title =~ s/[\x{80}-\x{ffff}]//go;   # drop non-ASCII after decomposition
            $title = lc($title);
            $title =~ s/\W+$//go;
        }
        if ($author) {
            $author = NFD($author);
            $author =~ s/[\x{80}-\x{ffff}]//go;
            $author = lc($author);
            $author =~ s/\W+$//go;
            if ($author =~ /^(\w+)/) { $author = $1; }          # first word only
        }
        if ($publisher) {
            $publisher = NFD($publisher);
            $publisher =~ s/[\x{80}-\x{ffff}]//go;
            $publisher = lc($publisher);
            $publisher =~ s/\W+$//go;
            if ($publisher =~ /^(\w+)/) { $publisher = $1; }    # first word only
        }

        # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
        # If we're not getting good matches, we may want to change this.
        # The same thing goes for some other fields.
        if ($item_form && $date1 && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {

            if ($which eq "primary") {
                print STDOUT join("\t", $id, $item_form, $date1, $record_type, $bib_lvl, $title) . "\n";
            } else {

                # case a : isbn and pages
                if (scalar(@isbns) > 0 && $pages) {
                    foreach my $isbn (@isbns) {
                        print STDOUT join("\t", $id, "case a", $item_form, $date1, $record_type, $bib_lvl, $title, $isbn, $pages) . "\n";
                    }
                }

                # case b : edition
                if ($edition) {
                    print STDOUT join("\t", $id, "case b", $item_form, $date1, $record_type, $bib_lvl, $title, $edition) . "\n";
                }

                # case c : issn
                if ($issn) {
                    print STDOUT join("\t", $id, "case c", $item_form, $date1, $record_type, $bib_lvl, $title, $issn) . "\n";
                }

                # case d : lccn
                if ($lccn) {
                    print STDOUT join("\t", $id, "case d", $item_form, $date1, $record_type, $bib_lvl, $title, $lccn) . "\n";
                }

                # case e : author, publisher, pubyear, pages
                if ($author && $publisher && $pubyear && $pages) {
                    print STDOUT join("\t", $id, "case e", $item_form, $date1, $record_type, $bib_lvl, $title, $author, $publisher, $pubyear, $pages) . "\n";
                }
            }

        } else {
            print STDERR "Record " . $id . " did not make the cut: ";
            print STDERR "Missing item_form. "   unless ($item_form);
            print STDERR "Missing valid date1. " unless ($date1 && $date1 =~ /\d\d\d\d/);
            print STDERR "Missing record_type. " unless ($record_type);
            print STDERR "Missing bib_lvl. "     unless ($bib_lvl);
            print STDERR "Missing title. "       unless ($title);
            print STDERR "\n";
        }
    }
    print STDERR "Processed $count records\n";
}
diff --git a/match_fingerprints.pl b/match_fingerprints.pl
new file mode 100755
index 0000000..a306640
--- /dev/null
+++ b/match_fingerprints.pl
#!/usr/bin/perl
# match_fingerprints.pl DATASET MATCH_TO_FP MATCH_THESE_FP [MATCH_TO_SCORE MATCH_THESE_SCORE]
#
# Matches incoming fingerprints (MATCH_THESE_FP) against incumbent/PINES
# fingerprints (MATCH_TO_FP).  Writes:
#   match.record_ids - matched incoming ids, then the chosen lead incumbent ids
#   match.groupings  - "checking^DATASET^lead^ids^ids" lines for the dedup db
# The optional score files hold "id|holdings|subtitle" lines used to pick the
# lead incumbent record.  Requires perl 5.10+ (defined-or operator).
use strict;
use warnings;

my $dataset           = $ARGV[0];
my $match_to          = $ARGV[1];
my $match_these       = $ARGV[2];
my $match_to_score    = $ARGV[3];
my $match_these_score = $ARGV[4];

print "match_to: $match_to match_these: $match_these\n";

my %pines;            # fingerprint -> [ incumbent ids ]
my %incoming;         # fingerprint -> [ incoming ids ]
my %match;            # lead incumbent id -> [ lead id, matched incoming ids... ]
my %candidate_match;  # incoming id -> [ fingerprints it matched on ]
my %score;            # record id -> [ holdings count, subtitle (245$b) ]

# Load one tab-delimited fingerprint file ("id\tfp-fields...") into a
# fingerprint -> list-of-ids map.
sub load_fingerprints {
    my ($file, $fp_map) = @_;
    open my $fh, '<', $file or die "Cannot open $file: $!";
    while (my $line = <$fh>) {
        chomp $line;
        my @fields = split /\t/, $line;
        my $id = shift @fields;
        my $fp = join '^', @fields;
        push @{ $fp_map->{$fp} }, $id;
    }
    close $fh;
}

load_fingerprints($match_to,    \%pines);
load_fingerprints($match_these, \%incoming);

# Scoring files are optional (4th/5th arguments).
# was: the loop iterated over the undeclared $match_from_score instead of
# $match_these_score, so the incoming score file was never read
foreach my $file ($match_to_score, $match_these_score) {
    next unless defined $file && length $file;
    open my $fh, '<', $file or die "Cannot open $file: $!";
    while (my $line = <$fh>) {
        chomp $line;
        my @fields = split /\|/, $line;
        my $id       = shift @fields; $id =~ s/\D//g;
        my $holdings = shift @fields; $holdings =~ s/\D//g;
        my $subtitle = shift @fields; $subtitle =~ s/^\s+//; $subtitle =~ s/\s+$//;
        $score{$id} = [ $holdings, $subtitle ];
    }
    close $fh;
}

# was: this handle was closed before the lead-record loop below printed to it,
# so the lead incumbent ids were silently lost; keep it open until the end
open my $record_ids_fh, '>', 'match.record_ids'
    or die "Cannot open match.record_ids: $!";

foreach my $fp (keys %incoming) {
    next unless defined $pines{$fp};    # match!
    foreach my $id (@{ $incoming{$fp} }) {
        print {$record_ids_fh} "$id\n";
        push @{ $candidate_match{$id} }, $fp;
    }
}

# For each matched incoming record, pick the lead incumbent record: a subtitle
# match wins outright; otherwise the incumbent with the most holdings wins.
foreach my $id (keys %candidate_match) {

    my $subtitle = defined $score{$id} ? $score{$id}[1] : undef;

    my $subtitle_matched = 0;
    my $highest_holdings = 0;
    my $best_pines_id;

    foreach my $fp (@{ $candidate_match{$id} }) {
        foreach my $pines_id (@{ $pines{$fp} }) {
            my $pines_holdings = defined $score{$pines_id} ? $score{$pines_id}[0] : undef;
            my $pines_subtitle = defined $score{$pines_id} ? $score{$pines_id}[1] : undef;
            if (($pines_subtitle // '') eq ($subtitle // '')) {
                if (!$subtitle_matched) {
                    $subtitle_matched = 1;
                    $best_pines_id = $pines_id;
                    $highest_holdings = -1;    # force the holdings re-seed below
                }
            } else {
                next if $subtitle_matched;     # subtitle matches beat holdings
            }
            if (($pines_holdings // 0) > $highest_holdings) {
                $highest_holdings = $pines_holdings // 0;
                $best_pines_id = $pines_id;
            }
        }
    }

    print {$record_ids_fh} "$best_pines_id\n";
    if (!defined $match{$best_pines_id}) { $match{$best_pines_id} = [ $best_pines_id ]; }
    push @{ $match{$best_pines_id} }, $id;
}
close $record_ids_fh;

open my $groupings_fh, '>', 'match.groupings'
    or die "Cannot open match.groupings: $!";
foreach my $k (keys %match) {

    print {$groupings_fh} join("^",
        "checking",
        $dataset,
        $match{$k}[0],
        join(",", @{ $match{$k} }),
        join(",", @{ $match{$k} })
    ) . "\n";

}
close $groupings_fh;
diff --git a/query_for_primary_matching_incumbent_record.pl b/query_for_primary_matching_incumbent_record.pl
new file mode 100755
index 0000000..ee504dd
--- /dev/null
+++ b/query_for_primary_matching_incumbent_record.pl
#!/usr/bin/perl
# Reads primary fingerprint lines (id, item_form, date1, record_type, bib_lvl,
# title - tab-delimited, as produced by "fingerprints.pl primary") from the
# files named on the command line or STDIN, and prints the id of every
# incumbent record whose stored fingerprint matches.
use strict;
use warnings;
use DBI;
use Data::Dumper;

################ THIS RESOURCE IS FOR PINES PRODUCTION
my $SOURCE_DBI_RESOURCE = "dbi:Pg:dbname=sparkle;host=10.1.0.12;port=5432";
my $SOURCE_DBI_USER     = 'postgres';
my $SOURCE_DBI_PASSWD   = '';
my $source_dbh = DBI->connect($SOURCE_DBI_RESOURCE, $SOURCE_DBI_USER, $SOURCE_DBI_PASSWD)
    or die("Database error: $DBI::errstr");
my $primary_fingerprint_tablename = "public.quitman_full_fingerprint_set";

# Print the id of each incumbent record matching the given fingerprint fields.
# Values are interpolated via $dbh->quote(), so they are safely escaped.
sub fetch_record {
    my ($item_form, $date1, $record_type, $bib_lvl, $title) = @_;
    my $sql = "select id from $primary_fingerprint_tablename where "
        . join(' AND ',
            " item_form = " . $source_dbh->quote($item_form),
            " substring = " . $source_dbh->quote($date1),
            " item_type = " . $source_dbh->quote($record_type),
            " bib_level = " . $source_dbh->quote($bib_lvl),
            " title = "     . $source_dbh->quote($title),
        );
    my $source_sth = $source_dbh->prepare($sql) or die("prepare error: $DBI::errstr \n[$sql]");
    $source_sth->execute() or die("execute error: $DBI::errstr \n[$sql]");

    while (my ($id) = $source_sth->fetchrow_array) {
        print "$id\n";
    }
    $source_sth->finish();
}

while (my $line = <>) {
    chomp $line;
    my ($id, $item_form, $date1, $record_type, $bib_lvl, $title) = split(/\t/, $line);
    if ($id eq 'id') { next; }    # skip a header row if present
    fetch_record($item_form, $date1, $record_type, $bib_lvl, $title);
}

$source_dbh->disconnect;
diff --git a/select_marc.pl b/select_marc.pl
new file mode 100755
index 0000000..152bd1c
--- /dev/null
+++ b/select_marc.pl
#!/usr/bin/perl
# select_marc.pl RECORD_ID_FILE ID_TAG ID_SUBFIELD MARCXML_FILE...
#
# Writes each record whose id (ID_TAG $ID_SUBFIELD) IS listed in RECORD_ID_FILE
# to its own MARCXML file, named after the record id, in the current directory.
use strict;
use warnings;
use MARC::Batch;
use MARC::Record;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;

my $record_id_file = $ARGV[0];
my %record_ids;

open my $ids_fh, '<', $record_id_file
    or die "Cannot open $record_id_file: $!";
while (my $record_id = <$ids_fh>) {
    chomp $record_id;
    $record_ids{$record_id} = 1;
}
close $ids_fh;

my $id_tag      = $ARGV[1];
my $id_subfield = $ARGV[2];

binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');

foreach my $argnum (3 .. $#ARGV) {

    print STDERR "Processing " . $ARGV[$argnum] . "\n";

    my $batch = MARC::Batch->new('XML', $ARGV[$argnum]);
    $batch->strict_off();
    $batch->warnings_off();

    my $count = 0;

    while (my $record = $batch->next()) {

        $count++;

        my $id = $record->field($id_tag);
        if (!$id) {
            print STDERR "ERROR: This record is missing a $id_tag field.\n"
                . $record->as_formatted() . "\n=====\n";
            next;
        }
        $id = $id->as_string($id_subfield);

        if (defined $record_ids{$id}) {
            # One output file per record, named after the record id.
            open my $out_fh, '>', $id or die "Cannot write $id: $!";
            binmode($out_fh, ':utf8');
            print {$out_fh} $record->as_xml();
            close $out_fh;
        }
    }
    print STDERR "Processed $count records.\n";
}
diff --git a/set_record_ids.pl b/set_record_ids.pl
new file mode 100755
index 0000000..911ed10
--- /dev/null
+++ b/set_record_ids.pl
#!/usr/bin/perl
# set_record_ids.pl INITIAL_ID TAG SUBFIELD MARCXML_FILE...
#
# Strips every existing TAG field from each record and appends a single new TAG
# whose SUBFIELD holds a sequential record id starting at INITIAL_ID.  The
# renumbered records are written to STDOUT as MARCXML.
use strict;
use warnings;
use MARC::Batch;
use MARC::Record;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;

my $initial_record_number = $ARGV[0];
my $record_tag_number     = $ARGV[1];
my $record_tag_subfield   = $ARGV[2];
my $count = 0;

binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');

foreach my $argnum (3 .. $#ARGV) {

    print STDERR "Processing " . $ARGV[$argnum] . ", starting record id at $initial_record_number\n";

    my $batch = MARC::Batch->new('XML', $ARGV[$argnum]);
    #$batch->strict_off();
    #$batch->warnings_off();

    while (my $record = $batch->next()) {

        $count++;

        # was: @warnings was never populated; pull this record's parser warnings
        my @warnings = $batch->warnings();
        print STDERR "WARNINGS: Record $count : " . join(":", @warnings)
            . " : continuing...\n" if (@warnings);

        # Remove every existing occurrence of the tag before appending the new id.
        while ($record->field($record_tag_number)) {
            $record->delete_field($record->field($record_tag_number));
        }
        my $new_id = $initial_record_number + $count - 1;
        my $new_id_field = MARC::Field->new($record_tag_number, ' ', ' ', $record_tag_subfield => $new_id);
        $record->append_fields($new_id_field);

        print $record->as_xml();
    }

    print STDERR "Processed $count records. Last record id at "
        . ($initial_record_number + $count - 1) . "\n";
}
diff --git a/trim_marc_based_on_tag_subfield_value.pl b/trim_marc_based_on_tag_subfield_value.pl
new file mode 100755
index 0000000..c8f1925
--- /dev/null
+++ b/trim_marc_based_on_tag_subfield_value.pl
#!/usr/bin/perl
# trim_marc_based_on_tag_subfield_value.pl TAG SUBFIELD VALUE MARCXML_FILE...
#
# Prints (as MARCXML on STDOUT) only those records having at least one TAG
# field whose SUBFIELD matches VALUE (case-insensitive regex match).
use strict;
use warnings;
use MARC::Batch;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;
use Unicode::Normalize;

my $tag_number   = $ARGV[0];
my $tag_subfield = $ARGV[1];
my $tag_value    = $ARGV[2];

my $count = 0;

binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');

foreach my $argnum (3 .. $#ARGV) {

    print STDERR "Processing " . $ARGV[$argnum] . "\n";

    my $batch = MARC::Batch->new('XML', $ARGV[$argnum]);
    $batch->strict_off();
    $batch->warnings_off();

    while (my $record = $batch->next()) {

        $count++;

        # was: @warnings was never populated; pull this record's parser warnings
        my @warnings = $batch->warnings();
        print STDERR "WARNINGS: Record $count : " . join(":", @warnings)
            . " : continuing...\n" if (@warnings);

        my $keep_me = 0;

        # was: "my @tags" was declared twice in this scope; once is enough
        my @tags;
        if ($record->field($tag_number)) { @tags = $record->field($tag_number); }
        foreach my $f (@tags) {
            next unless $f->subfield($tag_subfield);
            # NOTE(review): $tag_value is used unescaped as a regex; pass a
            # literal branch code, or quotemeta it upstream if it may contain
            # regex metacharacters.
            if ($f->subfield($tag_subfield) =~ m/($tag_value)/i) { $keep_me = 1; }
        }

        if ($keep_me) {
            print STDOUT $record->as_xml();
        }

    }
    print STDERR "Processed $count records\n";
}