+++ /dev/null
- Migration Data Work HOWTO / Toolkit
-========================================================================
-The following is for migrating into an existing system like PINES:
-
-Get the incoming bib data, and translate to UTF-8 MARCXML. It may
-contain holdings. It may contain XML or MARC errors that you have to
-sanitize before your tools will work. This is one way to translate
-MARC-8 MARC21 to UTF-8 MARCXML:
-
- yaz-marcdump -f MARC-8 -t UTF-8 -o marcxml \
- incoming.marc > incoming.marc.xml
-
-If you need to trim the bibs to a subset based on the presence of a
-certain value in a specific tag/subfield (for example, if you have the
-bibs for all libraries in a foreign system and only need bibs
-belonging to a specific migrating library, you might filter based on
-their holding tags)
-
- trim_marc_based_on_tag_subfield_value.pl 999 m BRANCH_CODE \
- incoming.marc.xml > incoming.filtered.marc.xml
-
-Embed potential native record ids into the incumbent records
-
- renumber_marc -rf 100000 -t 903 -s a -o incoming.renumbered.marc.xml \
- incoming.marc.xml
-
-Get primary fingerprints for incoming data and get a bib dump of
-matching records from the incumbent system
-
- fingerprinter -r primary -t 903 -s a -o incoming.primary.fp \
- -x incoming.primary.ex incoming.renumbered.mrc.xml
-
-#Edit the query_for_primary_matching_incumbent_record.pl script to
-#point to the correct Evergreen database and table holding the
-#incumbent primary fingerprints (FIXME add in how to create such a
-#table).
-#
-# query_for_primary_matching_incumbent_record.pl incoming.primary.fp \
-# | sort | uniq > primary_matching_incumbent.record_ids
-#
-#In a postgres shell, you create a temporary table to hold these id's:
-#
-# CREATE TABLE primary_matching_incumbent_records_for_incoming_library
-# (id BIGINT);
-# COPY primary_matching_incumbent_records_for_incoming_library
-# FROM 'primary_matching_incumbent.record_ids';
-#
-#To dump the matching incumbent records to a file, in a postgres shell
-#do:
-#
-# matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value,
-# regexp_replace(b.marc,E'\n','','g')
-# FROM biblio.record_entry AS b
-# JOIN primary_matching_incumbent_records_for_incoming_library
-# AS c using ( id );
-#
-#Now to turn that dump into a MARCXML file with record numbers and TCN
-#embedded in tag 901, do:
-#
-# marc_add_ids -f id -f tcn_source -f tcn_value -f marc \
-# < matching_incumbent_records.dump > matching_incumbent_records.marc.xml
-#
-#It's possible that this file may need to be itself sanitized some.
-#This will transform code=""" into code="&x0022;", for example:
-#
-# cat matching_incumbent_records.marc.xml | \
-# sed 's/code=\"\"\"/code=\"\"\"/' \
-# > matching_incumbent_records.escaped.mrc.xml
-
-Get full fingerprints for both datasets and match them.
-
- fingerprinter -r full -t 901 -s c -o incumbent.fp -x incumbent.ex \
- matching_incumbent_records.marc.xml
- fingerprinter -r full -t 903 -s a -o incoming.fp -x incoming.ex \
- incoming.renumbered.marc.xml
-
-The script below will produce matched groupings, and can optionally
-take a 4th and 5th parameter providing scoring information for
-determining lead records. In the past, this would consider certain
-metrics for MARC quality, but in the latest incarnation, it assumes an
-incumbent record will be the lead record, and looks at # of holdings
-and possible matching of tag 245 subfield b for determining which of
-the incumbent records would be the lead record. The example
-invocation below does not use scoring.
-
- match_fingerprints.pl "name of dataset for dedup interface" \
- incumbent.fp incoming.fp
-
-This will produce two files, match.groupings and match.record_ids.
-The format for match.groupings is suitable for insertion into the db
-for the dedup interface.
-
-Import these matches and records into the legacy dedup interface for viewing:
-
-Now to tar up the specific MARC records involved for the dedup interface:
-
- cat match.groupings | cut -d^ -f3 > incumbent.record_ids
- cat match.groupings | cut -d^ -f5 | cut -d, -f2- | sed 's/,/\n/g' \
- > incoming.record_ids
-
-# mkdir dataset ; cd dataset
-# select_marc.pl ../incumbent.record_ids 901 c \
-# ../matching_incumbent_records.mrc.xml
-# select_marc.pl ../incoming.record_ids 903 a \
-# ../incoming.renumbered.mrc.xml
-# cd ..
-# tar cvf dataset.tar dataset
-#
-#In a mysql shell for the database used with the dedup interface:
-#
-# LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group
-# FIELDS TERMINATED BY '^'
-# ( status, dataset, best_record,records,original_records );
-#
-#Create a pretty printed text dump of the non-matching incoming records:
-#
-# dump_inverse_select_marc.pl incoming.record_ids 903 a \
-# incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \
-# non_matching_incoming.mrc.txt.err
-
-marc2bre.pl --idfield=903 --dontuse=live_tcns.txt -f
-quitman_non_matching_incoming.mrc.xml -f
-catoosa_non_matching_incoming.mrc.xml --marctype=XML > some.bre
-
-direct_ingest.pl < some.bre > some.ingest
-
-perl pg_loader.pl -or bre -or mrd -or mfr -or mtfe -or mafe -or msfe
--or mkfe -or msefe -a mrd -a mfr -a mtfe -a mafe -a msfe -a mkfe -a
-msefe < ~/gutenberg.ingest > ~/gutenberg.sql
+++ /dev/null
-# One-linerize an XML (so that it can be split with 'split -l', for example)
-perl -e 'my $line="";while(<>){chomp;$line.=$_;if($_ eq "</record>"){$line=~s/>\s+</></go;print("$line\n");$line="";}}'
-# test 234
-# Fix attribute values of \" that fall inside of \"'s.
-for i in a b c; do sed 's/"""/" "/g' < pines-marca$i > pines-marca$i.xml; done
+++ /dev/null
-#!/usr/bin/perl
-use open ':utf8';
-use MARC::Batch;
-use MARC::File::XML ( BinaryEncoding => 'utf-8' );
-use MARC::Field;
-use Unicode::Normalize;
-
-my $count = 0;
-
-binmode(STDOUT, ':utf8');
-binmode(STDIN, ':utf8');
-
-print join("\t",
- "bib id",
- "library",
- "barcode",
- "current location",
- "home location",
- "call number",
- "item type",
- "acq date",
- "price",
- "circulate flag",
- "total charges",
- "cat1",
- "cat2"
-) . "\n";
-
-foreach my $argnum ( 0 .. $#ARGV ) {
-
- print STDERR "Processing " . $ARGV[$argnum] . "\n";
-
- my $M;
- open $M, '<:utf8', $ARGV[$argnum];
- my $batch = MARC::Batch->new('XML',$M);
-
- $batch->strict_off();
- $batch->warnings_off();
-
- while ( my $record = $batch->next() ) {
-
- $count++;
-
- print STDERR "WARNINGS: Record $count : " . join(":",@warnings) . " : continuing...\n" if ( @warnings );
- my $my_903a = $record->field('903')->subfield('a'); # target bib id's here
- my @tags = $record->field('999');
- foreach my $tag ( @tags ) {
- print join("\t",
- $my_903a,
- $tag->subfield('m') || '', # library
- $tag->subfield('i') || '', # barcode
- $tag->subfield('k') || '', # current location
- $tag->subfield('l') || '', # home location
- $tag->subfield('a') || '', # call number
- $tag->subfield('t') || '', # item type
- $tag->subfield('u') || '', # acq date
- $tag->subfield('p') || '', # price
- $tag->subfield('r') || '', # circulate flag
- $tag->subfield('n') || '', # total charges
- $tag->subfield('x') || '', # cat1
- $tag->subfield('z') || '' # cat2
- ) . "\n";
- }
-
- }
- print STDERR "Processed $count records\n";
-}
+++ /dev/null
-#!/usr/bin/perl -w
-
-my $count = 0;
-my %records = ();
-
-sub print_line {
- print join("\t",
- $records{ $count }{'FORM'} || '',
- $records{ $count }{'USER_ID'} || '',
- $records{ $count }{'ITEM_ID'} || '',
- $records{ $count }{'BILL_LIBRARY'} || '',
- $records{ $count }{'BILL_DB'} || '',
- $records{ $count }{'BILL_AMOUNT'} || '',
- $records{ $count }{'BILL_REASON'} || '',
- ) . "\n";
-}
-
-print "FORM\tUSER_ID\tITEM_ID\tBILL_LIBRARY\tBILL_DB\tBILL_AMOUNT\tBILL_REASON\n";
-
-while (my $line = <>) {
- chomp $line; $line =~ s/[\r\n]//g;
- if ($line =~ /DOCUMENT BOUNDARY/) {
- if (defined $records{ $count }) {
- print_line();
- }
- $count++; $records{ $count } = {};
- }
- if ($line =~ /FORM=(.+)/) {
- $records{ $count }{'FORM'} = $1;
- }
- if ($line =~ /\.USER_ID\..+\|a(.+)/) {
- $records{ $count }{'USER_ID'} = $1;
- }
- if ($line =~ /\.ITEM_ID\..+\|a(.+)/) {
- $records{ $count }{'ITEM_ID'} = $1;
- }
- if ($line =~ /\.BILL_LIBRARY\..+\|a(.+)/) {
- $records{ $count }{'BILL_LIBRARY'} = $1;
- }
- if ($line =~ /\.BILL_DB\..+\|a(.+)/) {
- $records{ $count }{'BILL_DB'} = $1;
- }
- if ($line =~ /\.BILL_AMOUNT\..+\|a(.+)/) {
- $records{ $count }{'BILL_AMOUNT'} = $1;
- }
- if ($line =~ /\.BILL_REASON\..+\|a(.+)/) {
- $records{ $count }{'BILL_REASON'} = $1;
- }
-}
-print_line();
-
+++ /dev/null
-#!/usr/bin/perl -w
-
-my $count = 0;
-my %records = ();
-
-sub print_line {
- print join("\t",
- $records{ $count }{'FORM'} || '',
- $records{ $count }{'USER_ID'} || '',
- $records{ $count }{'ITEM_ID'} || '',
- $records{ $count }{'CHRG_LIBRARY'} || '',
- $records{ $count }{'CHRG_DC'} || '',
- $records{ $count }{'CHRG_DATEDUE'} || '',
- $records{ $count }{'CHRG_DATE_CLMRET'} || '',
- ) . "\n";
-}
-
-print "FORM\tUSER_ID\tITEM_ID\tCHRG_LIBRARY\tCHRG_DC\tCHRG_DATEDUE\tCHRG_DATE_CLMRET\n";
-
-while (my $line = <>) {
- chomp $line; $line =~ s/[\r\n]//g;
- if ($line =~ /DOCUMENT BOUNDARY/) {
- if (defined $records{ $count }) {
- print_line();
- }
- $count++; $records{ $count } = {};
- }
- if ($line =~ /FORM=(.+)/) {
- $records{ $count }{'FORM'} = $1;
- }
- if ($line =~ /\.USER_ID\..+\|a(.+)/) {
- $records{ $count }{'USER_ID'} = $1;
- }
- if ($line =~ /\.ITEM_ID\..+\|a(.+)/) {
- $records{ $count }{'ITEM_ID'} = $1;
- }
- if ($line =~ /\.CHRG_LIBRARY\..+\|a(.+)/) {
- $records{ $count }{'CHRG_LIBRARY'} = $1;
- }
- if ($line =~ /\.CHRG_DC\..+\|a(.+)/) {
- $records{ $count }{'CHRG_DC'} = $1;
- }
- if ($line =~ /\.CHRG_DATEDUE\..+\|a(.+)/) {
- $records{ $count }{'CHRG_DATEDUE'} = $1;
- }
- if ($line =~ /\.CHRG_DATE_CLMRET\..+\|a(.+)/) {
- $records{ $count }{'CHRG_DATE_CLMRET'} = $1;
- }
-}
-print_line();
-
+++ /dev/null
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-use DateTime;
-use Time::HiRes qw/time/;
-use XML::LibXML;
-
-my %s_map;
-
-my $doc = XML::LibXML->new->parse_file($ARGV[0]);
-
-my $starttime = time;
-my $count = 1;
-
-my @base_elements = (
- "user_id",
- "user_altid",
- "user_pin",
- "user_profile",
- "user_status",
- "user_library",
- "user_priv_granted",
- "user_priv_expires",
- "user_mailingaddr",
- "birthdate",
- "last_name",
- "first_name",
- "middle_name",
- "suffix_name",
- "note",
- "note1",
- "comment",
- "staff",
- "webcatpref",
- "user_category1",
- "user_category2",
- "user_category3",
- "user_category4",
- "dept",
- "guardian",
- "license",
- "aup",
- "photo",
- "notify_via",
- "user_claims_ret",
- "user_environment",
- "user_department",
- "ums_id"
-);
-
-my @addr_elements = (
- "std_line1",
- "std_line2",
- "std_city",
- "std_state",
- "std_zip",
- "phone",
- "dayphone",
- "homephone",
- "workphone",
- "cellphone",
- "email",
- "location",
- "usefor",
- "care_of",
- "known_bad"
-);
-
-print STDOUT join("\t", @base_elements);
-foreach my $addr ( 1..3 ) {
- print STDOUT "\t" . join("\t", @addr_elements);
-}
-print STDOUT "\tuserid_active\tinactive_barcode1\tinactive_barcode2";
-print STDOUT "\n";
-
-for my $patron ( $doc->documentElement->childNodes ) {
- next if ($patron->nodeType == 3);
-
- my $bc = $patron->findvalue( 'user_id' ); $bc =~ s/^\s+//; $bc =~ s/\s+$//;
- if (exists($s_map{$bc})) {
- $count++;
- warn "\n!!! already saw barcode $bc, skipping\n";
- next;
- } else {
- $s_map{$bc} = 1;
- }
-
- unless (defined($bc)) {
- my $xml = $patron->toString;
- warn "\n!!! no barcode found in UMS data, user number $count, xml => $xml \n";
- $count++;
- next;
- }
-
- foreach my $e ( @base_elements ) {
- my $v = $patron->findvalue( $e ); $v =~ s/^\s+//; $v =~ s/\s+$//;
- if ( $v && ( $e eq 'birthdate' || $e eq 'user_priv_granted' || $e eq 'user_priv_expires' ) ) { $v = parse_date($v); }
- print STDOUT ( $v ? $v : '' ) . "\t";
- }
-
- my %addresses;
-
- for my $addr ( $patron->findnodes( "Address" ) ) {
- my $addr_type = $addr->getAttribute('addr_type');
- $addresses{$addr_type} = $addr;
- }
-
- foreach my $t ( 1..3 ) {
- if ($addresses{$t}) {
- foreach my $e ( @addr_elements ) {
- my $v;
- if ($e eq "known_bad") {
- $v = $addresses{$t}->getAttribute( $e ); if ($v) { $v =~ s/^\s+//; $v =~ s/\s+$//; }
- } else {
- $v = $addresses{$t}->findvalue( $e ); $v =~ s/^\s+//; $v =~ s/\s+$//;
- }
- print STDOUT ( $v ? $v : '' ) . "\t";
- }
- } else {
- foreach ( @addr_elements ) { print STDOUT "\t"; }
- }
- }
-
- my $inactive_barcode1 = '';
- my $inactive_barcode2 = '';
- my $userid_active = 't';
- my @barcodes = $patron->findnodes( "barcodes" );
- for my $i_bc ( $barcodes[0]->findnodes( "barcode" ) ) {
- my $active = $i_bc->getAttribute('active');
- if ($active eq "0" && $i_bc->textContent eq $bc) {
- $userid_active = 'f';
- }
- if ($active eq "0" && $i_bc->textContent ne $bc) {
- if (! $inactive_barcode1 ) {
- $inactive_barcode1 = $i_bc->textContent;
- $inactive_barcode1 =~ s/^\s+//;
- $inactive_barcode1 =~ s/\s+$//;
- } else {
- if (! $inactive_barcode2 ) {
- $inactive_barcode2 = $i_bc->textContent;
- $inactive_barcode2 =~ s/^\s+//;
- $inactive_barcode2 =~ s/\s+$//;
- } else {
- warn "Extra barcode (" . $i_bc->textContent . ") for user with id = " . $bc . "\n";
- }
- }
- }
- }
- print STDOUT "$userid_active\t$inactive_barcode1\t$inactive_barcode2";
-
- print STDOUT "\n";
- $count++;
-}
-
-sub parse_date {
- my $string = shift;
- my $group = shift;
-
- my ($y,$m,$d);
-
- if ($string eq 'NEVER') {
- my (undef,undef,undef,$d,$m,$y) = localtime();
- return sprintf('%04d-%02d-%02d', $y + 1920, $m + 1, $d);
- } elsif (length($string) == 8 && $string =~ /^(\d{4})(\d{2})(\d{2})$/o) {
- ($y,$m,$d) = ($1,$2,$3);
- } elsif ($string =~ /(\d+)\D(\d+)\D(\d+)/o) { #looks like it's parsable
- if ( length($3) > 2 ) { # looks like mm.dd.yyyy
- if ( $1 < 99 && $2 < 99 && $1 > 0 && $2 > 0 && $3 > 0) {
- if ($1 > 12 && $1 < 31 && $2 < 13) { # well, actually it looks like dd.mm.yyyy
- ($y,$m,$d) = ($3,$2,$1);
- } elsif ($2 > 12 && $2 < 31 && $1 < 13) {
- ($y,$m,$d) = ($3,$1,$2);
- }
- }
- } elsif ( length($1) > 3 ) { # format probably yyyy.mm.dd
- if ( $3 < 99 && $2 < 99 && $1 > 0 && $2 > 0 && $3 > 0) {
- if ($2 > 12 && $2 < 32 && $3 < 13) { # well, actually it looks like yyyy.dd.mm -- why, I don't konw
- ($y,$m,$d) = ($1,$3,$2);
- } elsif ($3 > 12 && $3 < 31 && $2 < 13) {
- ($y,$m,$d) = ($1,$2,$3);
- }
- }
- } elsif ( $1 < 99 && $2 < 99 && $3 < 99 && $1 > 0 && $2 > 0 && $3 > 0) {
- if ($3 < 7) { # probably 2000 or greater, mm.dd.yy
- $y = $3 + 2000;
- if ($1 > 12 && $1 < 32 && $2 < 13) { # well, actually it looks like dd.mm.yyyy
- ($m,$d) = ($2,$1);
- } elsif ($2 > 12 && $2 < 32 && $1 < 13) {
- ($m,$d) = ($1,$2);
- }
- } else { # probably before 2000, mm.dd.yy
- $y = $3 + 1900;
- if ($1 > 12 && $1 < 32 && $2 < 13) { # well, actually it looks like dd.mm.yyyy
- ($m,$d) = ($2,$1);
- } elsif ($2 > 12 && $2 < 32 && $1 < 13) {
- ($m,$d) = ($1,$2);
- }
- }
- }
- }
-
- my $date = $string;
- if ($y && $m && $d) {
- eval {
- $date = sprintf('%04d-%02d-%-2d',$y, $m, $d)
- if (new DateTime ( year => $y, month => $m, day => $d ));
- }
- }
-
- return $date;
-}
-