3 # Copyright (c) 2016 Equinox Software, Inc.
4 # Author: Galen Charlton <gmc@esilibrary.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 use MARC::File::XML (BinaryEncoding => 'utf8');
26 use OpenILS::Application::AppUtils;
29 my $schema = 'bib_loads';
39 'action:s' => \$action,
40 'schema:s' => \$schema,
42 'dbuser:s' => \$dbuser,
43 'dbhost:s' => \$dbhost,
46 'cutoff:s' => \$cutoff,
50 abort('must specify --action') unless defined $action;
51 abort('must specify --schema') unless defined $schema;
52 abort('must specify --db') unless defined $db;
53 abort('must specify --dbuser') unless defined $dbuser;
54 abort('must specify --dbhost') unless defined $dbhost;
55 abort('must specify --dbpw') unless defined $dbpw;
56 abort('must specify --batch') unless defined $batch;
58 abort(q{--action must be "stage_bibs", "filter_bibs", "load_bibs", "stage_auths",
59 "match_auths", "load_new_auths", "overlay_auths_stage1",
60 "overlay_auths_stage2", "overlay_auths_stage3", "link_auth_auth"}) unless
61 $action eq 'filter_bibs' or
62 $action eq 'stage_bibs' or
63 $action eq 'load_bibs' or
64 $action eq 'stage_auths' or
65 $action eq 'match_auths' or
66 $action eq 'load_new_auths' or
67 $action eq 'overlay_auths_stage1' or
68 $action eq 'overlay_auths_stage2' or
69 $action eq 'overlay_auths_stage3' or
70 $action eq 'link_auth_auth' or
71 $action eq 'link_auth_bib'
74 my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost);
76 if ($action eq 'stage_bibs') {
77 abort('must specify at least one input file') unless @ARGV;
78 handle_stage_bibs($dbh, $schema, $batch);
81 if ($action eq 'filter_bibs') {
82 abort('must specify cutoff date when filtering') unless defined $cutoff;
83 handle_filter_bibs($dbh, $schema, $batch, $cutoff);
86 if ($action eq 'load_bibs' ) {
87 handle_load_bibs($dbh, $schema, $batch, $wait);
90 if ($action eq 'stage_auths') {
91 abort('must specify at least one input file') unless @ARGV;
92 handle_stage_auths($dbh, $schema, $batch);
95 if ($action eq 'match_auths') {
96 handle_match_auths($dbh, $schema, $batch);
99 if ($action eq 'load_new_auths') {
100 handle_load_new_auths($dbh, $schema, $batch);
103 if ($action eq 'overlay_auths_stage1') {
104 handle_overlay_auths_stage1($dbh, $schema, $batch);
106 if ($action eq 'overlay_auths_stage2') {
107 handle_overlay_auths_stage2($dbh, $schema, $batch);
109 if ($action eq 'overlay_auths_stage3') {
110 handle_overlay_auths_stage3($dbh, $schema, $batch);
113 if ($action eq 'link_auth_auth') {
114 handle_link_auth_auth($dbh, $schema, $batch);
116 if ($action eq 'link_auth_bib') {
117 handle_link_auth_bib($dbh, $schema, $batch);
122 print STDERR "$0: $msg", "\n";
130 Utility to stage and overlay bib records in an Evergreen database. This
131 expects that the incoming records will have been previously exported
132 from that Evergreen database and modified in some fashion (e.g., for
133 authority record processing) and that the bib ID can be found in the
136 This program has several modes controlled by the --action switch:
138 --action stage_bibs - load MARC bib records into a staging table
139 --action filter_bibs - mark previously staged bibs that should
140 be excluded from a subsequent load, either
141 because the target bib is deleted in Evergreen
142 or the record was modified after a date
143 specified by the --cutoff switch
144 --action load_bibs - overlay bib records using a previously staged
145 batch, one at a time. After each bib, it will
146 wait the number of seconds specified by the
149 --action stage_auths - load MARC authorities into staging
151 --action match_auths - identify matches with authority
152 records already present in the
153 Evergreen database; matching is
154 based on LCCN, cancelled LCCN, and
156 --action load_new_auths - load new (unmatched) authorities
157 --action overlay_auths_stage1 - overlay based on LCCN where
158 heading has NOT changed; this step
159 disables propagation to bib records
160 --action overlay_auths_stage2 - overlay based on LCCN where heading
161 HAS changed; propagates changes
163 --action overlay_auths_stage3 - overlay for records where a cancelled
164 LCCN is replaced with a new one
165 --action link_auth_auth - run authority_authority_linker.pl for
166 the authorities that were overlaid
167 or added in this batch.
168 --action link_auth_bib - run authority_control_fields.pl for
169 the bibs that were overlaid in this
172 Several switches are used regardless of the specified action:
174 --schema - Pg schema in which staging table will live; should be
176 --batch - name of bib batch; will also be used as the name
177 of the staging tables
179 --dbuser - database user
180 --dbpw - database password
181 --dbhost - database host
185 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
186 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
187 --action stage_bibs -- file1.mrc file2.mrc [...]
189 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
190 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
191 --action filter_bibs --cutoff 2016-01-02
193 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
194 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
195 --action load_bibs --wait 2
201 sub report_progress {
202 my ($msg, $counter) = @_;
203 if (defined $counter) {
204 print STDERR "$msg: $counter\n";
206 print STDERR "$msg\n";
211 my ($db, $dbuser, $dbpw, $dbhost) = @_;
213 my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432";
216 ShowErrorStatement => 1,
221 my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs);
226 sub handle_stage_bibs {
232 DROP TABLE IF EXISTS $schema.$batch;
235 CREATE TABLE $schema.$batch (
239 imported BOOLEAN DEFAULT FALSE,
240 to_import BOOLEAN DEFAULT TRUE,
247 binmode STDIN, ':utf8';
248 my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)");
253 report_progress("Records staged", $i);
257 my $marc = MARC::Record->new_from_usmarc($_);
258 my $bibid = $marc->subfield('901', 'c');
259 if ($bibid !~ /^\d+$/) {
260 print STDERR "Record $i is suspect; skipping\n";
263 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
264 $ins->execute($xml, $bibid);
267 report_progress("Records staged", $i) if 0 != $i % 100;
269 CREATE INDEX ${batch}_bib_id_idx ON
270 $schema.$batch (bib_id);
273 CREATE INDEX ${batch}_id_idx ON
278 sub handle_filter_bibs {
284 my $sth1 = $dbh->prepare(qq{
285 UPDATE $schema.$batch
286 SET to_import = FALSE,
287 skip_reason = 'deleted'
290 FROM biblio.record_entry
297 my $ct = $sth1->rows;
298 report_progress("Filtering out $ct records that are currently deleted");
300 my $sth2 = $dbh->prepare(qq{
301 UPDATE $schema.$batch
302 SET to_import = FALSE,
303 skip_reason = 'edited after cutoff of $cutoff'
306 FROM biblio.record_entry
312 $sth2->execute($cutoff);
314 report_progress("Filtering out $ct records edited after cutoff date of $cutoff");
316 my $sth3 = $dbh->prepare(qq{
317 UPDATE $schema.$batch
318 SET to_import = FALSE,
319 skip_reason = 'XML is not well-formed'
320 WHERE NOT xml_is_well_formed(marc)
326 report_progress("Filtering out $ct records whose XML is not well-formed");
329 sub handle_load_bibs {
335 my $getct = $dbh->prepare(qq{
342 my $max = $getct->fetchrow_arrayref()->[0];
344 report_progress('Number of bibs to update', $max);
345 for (my $i = 1; $i <= $max; $i++) {
346 report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max;
349 UPDATE biblio.record_entry a
351 FROM $schema.$batch b
352 WHERE a.id = b.bib_id
363 UPDATE $schema.$batch
379 sub handle_stage_auths {
385 DROP TABLE IF EXISTS $schema.auths_$batch;
388 CREATE TABLE $schema.auths_$batch (
393 existing_heading TEXT,
396 cancelled_auth_id BIGINT,
398 lccn_matched BOOLEAN DEFAULT FALSE,
399 heading_matched BOOLEAN DEFAULT FALSE,
400 imported BOOLEAN DEFAULT FALSE,
401 to_import BOOLEAN DEFAULT TRUE,
408 binmode STDIN, ':utf8';
409 my $ins = $dbh->prepare(qq{
410 INSERT INTO $schema.auths_$batch (marc, auth_id, lccn, cancelled_lccn, heading)
411 VALUES (?, ?, ?, ?, authority.normalize_heading(?))
417 report_progress("Records staged", $i);
421 my $marc = MARC::Record->new_from_usmarc($_);
422 my $authid = $marc->subfield('901', 'c');
423 if (defined($authid) && $authid !~ /^\d+$/) {
426 my $lccn = $marc->subfield('010', 'a');
432 my $cancelled_lccn = $marc->subfield('010', 'z');
433 if (defined $cancelled_lccn) {
434 $cancelled_lccn =~ s/^\s+//;
435 $cancelled_lccn =~ s/\s+$//;
436 $cancelled_lccn =~ s/\s+/ /g;
438 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
439 $ins->execute($xml, $authid, $lccn, $cancelled_lccn, $xml);
442 report_progress("Records staged", $i) if 0 != $i % 100;
444 CREATE INDEX auths_${batch}_auth_id_idx ON
445 $schema.auths_$batch (auth_id);
448 CREATE INDEX auths_${batch}_id_idx ON
449 $schema.auths_$batch (id);
452 CREATE INDEX auths_${batch}_lccn_idx ON
453 $schema.auths_$batch (lccn);
457 sub handle_match_auths {
458 my ($dbh, $schema, $batch) = @_;
460 my $sth = $dbh->prepare(qq{
461 UPDATE $schema.auths_${batch} a
462 SET auth_id = b.record,
464 existing_heading = authority.normalize_heading(c.marc)
465 FROM authority.full_rec b
466 JOIN authority.record_entry c ON (b.record = c.id)
471 AND lccn IS NOT NULL;
475 report_progress("Matched $ct authorities on LCCN");
477 $sth = $dbh->prepare(qq{
478 UPDATE $schema.auths_${batch} a
479 SET cancelled_auth_id = b.record
480 FROM authority.full_rec b
483 AND value = cancelled_lccn
485 AND cancelled_lccn IS NOT NULL;
489 report_progress("Matched $ct authorities on cancelled LCCN");
491 $sth = $dbh->prepare(qq{
492 UPDATE $schema.auths_$batch a
494 heading_matched = TRUE,
495 existing_heading = b.heading
496 FROM authority.record_entry b
497 WHERE a.heading = b.heading
502 report_progress("Matched $ct authorities on heading");
505 sub handle_load_new_auths {
510 my $getct = $dbh->prepare(qq{
512 FROM $schema.auths_$batch
515 AND new_auth_id IS NULL
517 AND cancelled_auth_id IS NULL
520 my $max = $getct->fetchrow_arrayref()->[0];
522 report_progress('Number of authorities to add', $max);
523 for (my $i = 1; $i <= $max; $i++) {
524 report_progress('... authorities added', $i) if 0 == $i % 10 or $i == $max;
527 INSERT INTO authority.record_entry (marc, last_xact_id)
528 SELECT marc, ? || '-' || id
529 FROM $schema.auths_$batch b
532 FROM $schema.auths_$batch
535 AND new_auth_id IS NULL
537 AND cancelled_auth_id IS NULL
541 }, {}, "auths_$batch");
543 UPDATE $schema.auths_$batch
545 new_auth_id = CURRVAL('authority.record_entry_id_seq')
548 FROM $schema.auths_$batch
551 AND new_auth_id IS NULL
553 AND cancelled_auth_id IS NULL
562 sub handle_overlay_auths_stage1 {
567 my $getct = $dbh->prepare(qq{
569 FROM $schema.auths_$batch
573 AND heading = existing_heading
576 my $max = $getct->fetchrow_arrayref()->[0];
577 report_progress('Number of auths to update', $max);
580 UPDATE config.internal_flag SET enabled = TRUE
581 WHERE name = 'ingest.disable_authority_auto_update';
583 for (my $i = 1; $i <= $max; $i++) {
584 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
587 UPDATE authority.record_entry a
590 FROM $schema.auths_$batch b
591 WHERE a.id = b.auth_id
594 FROM $schema.auths_$batch
598 AND heading = existing_heading
604 UPDATE $schema.auths_$batch
608 FROM $schema.auths_$batch
612 AND heading = existing_heading
620 UPDATE config.internal_flag SET enabled = FALSE
621 WHERE name = 'ingest.disable_authority_auto_update';
625 sub handle_overlay_auths_stage2 {
630 my $getct = $dbh->prepare(qq{
632 FROM $schema.auths_$batch
636 AND heading <> existing_heading
639 my $max = $getct->fetchrow_arrayref()->[0];
640 report_progress('Number of auths to update', $max);
642 for (my $i = 1; $i <= $max; $i++) {
643 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
646 UPDATE authority.record_entry a
649 FROM $schema.auths_$batch b
650 WHERE a.id = b.auth_id
653 FROM $schema.auths_$batch
657 AND heading <> existing_heading
663 UPDATE $schema.auths_$batch
667 FROM $schema.auths_$batch
671 AND heading <> existing_heading
680 sub handle_overlay_auths_stage3 {
685 my $getct = $dbh->prepare(qq{
687 FROM $schema.auths_$batch
692 auth_id = cancelled_auth_id
694 AND cancelled_auth_id IS NOT NULL
697 my $max = $getct->fetchrow_arrayref()->[0];
698 report_progress('Number of auths to update', $max);
700 for (my $i = 1; $i <= $max; $i++) {
701 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
704 UPDATE authority.record_entry a
707 FROM $schema.auths_$batch b
708 WHERE a.id = b.cancelled_auth_id
709 AND cancelled_auth_id IN (
711 FROM $schema.auths_$batch
716 auth_id = cancelled_auth_id
718 AND cancelled_auth_id IS NOT NULL
724 UPDATE $schema.auths_$batch
726 WHERE cancelled_auth_id IN (
727 SELECT cancelled_auth_id
728 FROM $schema.auths_$batch
733 auth_id = cancelled_auth_id
735 AND cancelled_auth_id IS NOT NULL
744 sub handle_link_auth_auth {
750 UPDATE config.internal_flag SET enabled = TRUE
751 WHERE name = 'ingest.disable_authority_auto_update';
754 my $sth = $dbh->prepare(qq{
755 SELECT COALESCE(new_auth_id, auth_id, cancelled_auth_id) AS id
756 FROM $schema.auths_$batch
761 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
763 report_progress(scalar(@ids) . " records to do auth-auth linking");
764 foreach my $id (@ids) {
766 report_progress('... auth-auth linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
767 system "/openils/bin/authority_authority_linker.pl -r $id -c /openils/conf/opensrf_core.xml";
771 UPDATE config.internal_flag SET enabled = FALSE
772 WHERE name = 'ingest.disable_authority_auto_update';
776 sub handle_link_auth_bib {
781 my $sth = $dbh->prepare(qq{
788 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
790 report_progress(scalar(@ids) . " records to do auth-bib linking");
791 foreach my $id (@ids) {
793 report_progress('... auth-bib linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
794 system "/openils/bin/authority_control_fields.pl --record $id -c /openils/conf/opensrf_core.xml";