#!/usr/bin/perl # Copyright (c) 2016 Equinox Software, Inc. # Author: Galen Charlton # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see use strict; use warnings; use Getopt::Long; use MARC::Record; use MARC::File::XML (BinaryEncoding => 'utf8'); use DBI; use OpenILS::Application::AppUtils; my $action; my $schema = 'bib_loads'; my $db; my $dbuser; my $dbpw; my $dbhost; my $batch; my $cutoff; my $wait = 1; my $ret = GetOptions( 'action:s' => \$action, 'schema:s' => \$schema, 'db:s' => \$db, 'dbuser:s' => \$dbuser, 'dbhost:s' => \$dbhost, 'dbpw:s' => \$dbpw, 'batch:s' => \$batch, 'cutoff:s' => \$cutoff, 'wait:i' => \$wait, ); abort('must specify --action') unless defined $action; abort('must specify --schema') unless defined $schema; abort('must specify --db') unless defined $db; abort('must specify --dbuser') unless defined $dbuser; abort('must specify --dbhost') unless defined $dbhost; abort('must specify --dbpw') unless defined $dbpw; abort('must specify --batch') unless defined $batch; abort('--action must be "stage_bibs" or "filter_bibs" or "load_bibs"') unless $action eq 'filter_bibs' or $action eq 'stage_bibs' or $action eq 'load_bibs'; my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost); if ($action eq 'stage_bibs') { abort('must specify at least one input file') unless @ARGV; handle_stage_bibs($dbh, $schema, $batch); } if ($action eq 'filter_bibs') { abort('must specify cutoff date when filtering') unless defined $cutoff; handle_filter_bibs($dbh, $schema, $batch, $cutoff); } if ($action eq 'load_bibs' ) { handle_load_bibs($dbh, $schema, $batch, $wait); } sub abort { my $msg = shift; print STDERR "$0: $msg", "\n"; print_usage(); exit 1; } sub print_usage { print <<_USAGE_; Utility to stage and overlay bib records in an Evergreen database. This expects that the incoming records will have been previously exported from that Evergreen database and modified in some fashion (e.g., for authority record processing) and that the bib ID can be found in the 901\$c subfield. This program has several modes controlled by the --action switch: --action stage_bibs - load MARC bib records into a staging table --action filter_bibs - mark previously staged bibs that should be excluded from a subsequent load, either because the target bib is deleted in Evergreen or the record was modified after a date specified by the --cutoff switch --action load_bibs - overlay bib records using a previously staged batch, one at a time. After each bib, it will wait the number of seconds specified by the --wait switch. Several switches are used regardless of the specified action: --schema - Pg schema in which staging table will live; should be created beforehand --batch - name of bib batch; will also be used as the name of the staging table --db - database name --dbuser - database user --dbpw - database password --dbhost - database host Examples: $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\ --dbuser evergreen --dbpw evergreen --dbhost localhost \\ --action stage_bibs -- file1.mrc file2.mrc [...] $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\ --dbuser evergreen --dbpw evergreen --dbhost localhost \\ --action filter_bibs --cutoff 2016-01-02 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\ --dbuser evergreen --dbpw evergreen --dbhost localhost \\ --action load_bibs _USAGE_ } sub report_progress { my ($msg, $counter) = @_; if (defined $counter) { print STDERR "$msg: $counter\n"; } else { print STDERR "$msg\n"; } } sub connect_db { my ($db, $dbuser, $dbpw, $dbhost) = @_; my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432"; my $attrs = { ShowErrorStatement => 1, RaiseError => 1, PrintError => 1, pg_enable_utf8 => 1, }; my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs); return $dbh; } sub handle_stage_bibs { my $dbh = shift; my $schema = shift; my $batch = shift; $dbh->do(qq{ DROP TABLE IF EXISTS $schema.$batch; }); $dbh->do(qq{ CREATE TABLE $schema.$batch ( id SERIAL, marc TEXT, bib_id BIGINT, imported BOOLEAN DEFAULT FALSE, to_import BOOLEAN DEFAULT TRUE, skip_reason TEXT ) }); local $/ = "\035"; my $i = 0; binmode STDIN, ':utf8'; my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)"); $dbh->begin_work; while (<>) { $i++; if (0 == $i % 100) { report_progress("Records staged", $i); $dbh->commit; $dbh->begin_work; } my $marc = MARC::Record->new_from_usmarc($_); my $bibid = $marc->subfield('901', 'c'); if ($bibid !~ /^\d+$/) { print STDERR "Record $i is suspect; skipping\n"; next; } my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record()); $ins->execute($xml, $bibid); } $dbh->commit; report_progress("Records staged", $i) if 0 != $i % 100; $dbh->do(qq/ CREATE INDEX ${batch}_bib_id_idx ON $schema.$batch (bib_id); /); $dbh->do(qq/ CREATE INDEX ${batch}_id_idx ON $schema.$batch (id); /); } sub handle_filter_bibs { my $dbh = shift; my $schema = shift; my $batch = shift; my $cutoff = shift; my $sth1 = $dbh->prepare(qq{ UPDATE $schema.$batch SET to_import = FALSE, skip_reason = 'deleted' WHERE bib_id IN ( SELECT id FROM biblio.record_entry WHERE deleted ) }); $sth1->execute(); my $ct = $sth1->rows; report_progress("Filtering out $ct records that are currently deleted"); my $sth2 = $dbh->prepare(qq{ UPDATE $schema.$batch SET to_import = FALSE, skip_reason = 'edited after cutoff of $cutoff' WHERE bib_id IN ( SELECT id FROM biblio.record_entry WHERE edit_date >= ? ) AND to_import; }); $sth2->execute($cutoff); $ct = $sth2->rows; report_progress("Filtering out $ct records edited after cutoff date of $cutoff"); } sub handle_load_bibs { my $dbh = shift; my $schema = shift; my $batch = shift; my $wait = shift; my $getct = $dbh->prepare(qq{ SELECT COUNT(*) FROM $schema.$batch WHERE to_import AND NOT imported }); $getct->execute(); my $max = $getct->fetchrow_arrayref()->[0]; report_progress('Number of bibs to update', $max); for (my $i = 1; $i <= $max; $i++) { report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max; $dbh->begin_work; $dbh->do(qq{ UPDATE biblio.record_entry a SET marc = b.marc FROM $schema.$batch b WHERE a.id = b.bib_id AND bib_id IN ( SELECT bib_id FROM $schema.$batch WHERE to_import AND NOT imported ORDER BY id LIMIT 1 ) }); $dbh->do(qq{ UPDATE $schema.$batch SET imported = TRUE WHERE bib_id IN ( SELECT bib_id FROM $schema.$batch WHERE to_import AND NOT imported ORDER BY id LIMIT 1 ) }); $dbh->commit; sleep $wait; } }