3 # Copyright (c) 2016 Equinox Software, Inc.
4 # Author: Galen Charlton <gmc@esilibrary.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 use MARC::File::XML (BinaryEncoding => 'utf8');
26 use OpenILS::Application::AppUtils;
29 my $schema = 'bib_loads';
41 'action:s' => \$action,
42 'schema:s' => \$schema,
44 'dbuser:s' => \$dbuser,
45 'dbhost:s' => \$dbhost,
48 'cutoff:s' => \$cutoff,
50 'output:s' => \$output,
51 'link-skipped' => \$link_skipped,
54 abort('must specify --action') unless defined $action;
55 abort('must specify --schema') unless defined $schema;
56 abort('must specify --db') unless defined $db;
57 abort('must specify --dbuser') unless defined $dbuser;
58 abort('must specify --dbhost') unless defined $dbhost;
59 abort('must specify --dbpw') unless defined $dbpw;
60 abort('must specify --batch') unless defined $batch;
62 abort(q{--action must be "stage_bibs", "filter_bibs", "load_bibs", "stage_auths",
63 "match_auths", "load_new_auths", "overlay_auths_stage1",
64 "overlay_auths_stage2", "overlay_auths_stage3", "link_auth_auth",
65 "link_auth_bib", "export_skipped_bibs", or "export_skipped_auths"}) unless
66 $action eq 'filter_bibs' or
67 $action eq 'stage_bibs' or
68 $action eq 'load_bibs' or
69 $action eq 'stage_auths' or
70 $action eq 'match_auths' or
71 $action eq 'load_new_auths' or
72 $action eq 'overlay_auths_stage1' or
73 $action eq 'overlay_auths_stage2' or
74 $action eq 'overlay_auths_stage3' or
75 $action eq 'link_auth_auth' or
76 $action eq 'link_auth_bib' or
77 $action eq 'export_skipped_bibs' or
78 $action eq 'export_skipped_auths'
81 my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost);
83 if ($action eq 'stage_bibs') {
84 abort('must specify at least one input file') unless @ARGV;
85 handle_stage_bibs($dbh, $schema, $batch);
88 if ($action eq 'filter_bibs') {
89 abort('must specify cutoff date when filtering') unless defined $cutoff;
90 handle_filter_bibs($dbh, $schema, $batch, $cutoff);
93 if ($action eq 'load_bibs' ) {
94 handle_load_bibs($dbh, $schema, $batch, $wait);
97 if ($action eq 'stage_auths') {
98 abort('must specify at least one input file') unless @ARGV;
99 handle_stage_auths($dbh, $schema, $batch);
102 if ($action eq 'match_auths') {
103 handle_match_auths($dbh, $schema, $batch);
106 if ($action eq 'load_new_auths') {
107 handle_load_new_auths($dbh, $schema, $batch);
110 if ($action eq 'overlay_auths_stage1') {
111 handle_overlay_auths_stage1($dbh, $schema, $batch);
113 if ($action eq 'overlay_auths_stage2') {
114 handle_overlay_auths_stage2($dbh, $schema, $batch);
116 if ($action eq 'overlay_auths_stage3') {
117 handle_overlay_auths_stage3($dbh, $schema, $batch);
120 if ($action eq 'link_auth_auth') {
121 handle_link_auth_auth($dbh, $schema, $batch);
123 if ($action eq 'link_auth_bib') {
124 handle_link_auth_bib($dbh, $schema, $batch, $link_skipped);
127 if ($action eq 'export_skipped_bibs') {
128 abort('must specify output file') unless defined $output;
129 handle_export_skipped_bibs($dbh, $schema, $batch, $output);
131 if ($action eq 'export_skipped_auths') {
132 abort('must specify output file') unless defined $output;
133 handle_export_skipped_auths($dbh, $schema, $batch, $output);
138 print STDERR "$0: $msg", "\n";
146 Utility to stage and overlay bib records in an Evergreen database. This
147 expects that the incoming records will have been previously exported
148 from that Evergreen database and modified in some fashion (e.g., for
149 authority record processing) and that the bib ID can be found in the
152 This program has several modes controlled by the --action switch:
154 --action stage_bibs - load MARC bib records into a staging table
155 --action filter_bibs - mark previously staged bibs that should
156 be excluded from a subsequent load, either
157 because the target bib is deleted in Evergreen
158 or the record was modified after a date
159 specified by the --cutoff switch
160 --action load_bibs - overlay bib records using a previously staged
161 batch, one at a time. After each bib, it will
162 wait the number of seconds specified by the
165 --action stage_auths - load MARC authorities into staging
167 --action match_auths - identify matches with authority
168 records already present in the
169 Evergreen database; matching is
170 based on LCCN, cancelled LCCN, and
172 --action load_new_auths - load new (unmatched) authorities
173 --action overlay_auths_stage1 - overlay based on LCCN where
174 heading has NOT changed; this step
175 disables propagation to bib records
176 --action overlay_auths_stage2 - overlay based on LCCN where heading
177 HAS changed; propagates changes
179 --action overlay_auths_stage3 - overlay for records where a cancelled
180 LCCN is replaced with a new one
181 --action link_auth_auth - run authority_authority_linker.pl for
182 the authorities that were overlaid
183 or added in this batch.
184 --action link_auth_bib - run authority_control_fields.pl for
185 the bibs that were overlaid in this
186 batch. Add --link-skipped to specify
187 that bibs that were matched but
188 skipped due to having be edited after
189 the cutoff should be linked (rather
190 than linking the imported bibs)
191 --action export_skipped_bibs - export to ISO2709 file whose name is
192 specified by --output those bibs
193 that had been edited after the cutoff.
194 --action export_skipped_auths - export to ISO2709 file whose name is
195 specified by --output those authorities
196 that could not be definitively
197 handled as updates or adds.
199 Several switches are used regardless of the specified action:
201 --schema - Pg schema in which staging table will live; should be
203 --batch - name of bib batch; will also be used as the name
204 of the staging tables
206 --dbuser - database user
207 --dbpw - database password
208 --dbhost - database host
212 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
213 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
214 --action stage_bibs -- file1.mrc file2.mrc [...]
216 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
217 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
218 --action filter_bibs --cutoff 2016-01-02
220 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
221 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
222 --action load_bibs --wait 2
228 sub report_progress {
229 my ($msg, $counter) = @_;
230 if (defined $counter) {
231 print STDERR "$msg: $counter\n";
233 print STDERR "$msg\n";
238 my ($db, $dbuser, $dbpw, $dbhost) = @_;
240 my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432";
243 ShowErrorStatement => 1,
248 my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs);
253 sub handle_stage_bibs {
259 DROP TABLE IF EXISTS $schema.$batch;
262 CREATE TABLE $schema.$batch (
266 imported BOOLEAN DEFAULT FALSE,
267 to_import BOOLEAN DEFAULT TRUE,
274 binmode STDIN, ':utf8';
275 my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)");
280 report_progress("Records staged", $i);
284 my $marc = MARC::Record->new_from_usmarc($_);
285 my $bibid = $marc->subfield('901', 'c');
286 if ($bibid !~ /^\d+$/) {
287 print STDERR "Record $i is suspect; skipping\n";
290 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
291 $ins->execute($xml, $bibid);
294 report_progress("Records staged", $i) if 0 != $i % 100;
296 CREATE INDEX ${batch}_bib_id_idx ON
297 $schema.$batch (bib_id);
300 CREATE INDEX ${batch}_id_idx ON
305 sub handle_filter_bibs {
311 my $sth1 = $dbh->prepare(qq{
312 UPDATE $schema.$batch
313 SET to_import = FALSE,
314 skip_reason = 'deleted'
317 FROM biblio.record_entry
324 my $ct = $sth1->rows;
325 report_progress("Filtering out $ct records that are currently deleted");
327 my $sth2 = $dbh->prepare(qq{
328 UPDATE $schema.$batch
329 SET to_import = FALSE,
330 skip_reason = 'edited after cutoff of $cutoff'
333 FROM biblio.record_entry
339 $sth2->execute($cutoff);
341 report_progress("Filtering out $ct records edited after cutoff date of $cutoff");
343 my $sth3 = $dbh->prepare(qq{
344 UPDATE $schema.$batch
345 SET to_import = FALSE,
346 skip_reason = 'XML is not well-formed'
347 WHERE NOT xml_is_well_formed(marc)
353 report_progress("Filtering out $ct records whose XML is not well-formed");
356 sub handle_load_bibs {
362 my $getct = $dbh->prepare(qq{
369 my $max = $getct->fetchrow_arrayref()->[0];
371 report_progress('Number of bibs to update', $max);
372 for (my $i = 1; $i <= $max; $i++) {
373 report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max;
376 UPDATE biblio.record_entry a
378 FROM $schema.$batch b
379 WHERE a.id = b.bib_id
390 UPDATE $schema.$batch
406 sub handle_stage_auths {
412 DROP TABLE IF EXISTS $schema.auths_$batch;
415 CREATE TABLE $schema.auths_$batch (
420 existing_heading TEXT,
423 cancelled_auth_id BIGINT,
425 lccn_matched BOOLEAN DEFAULT FALSE,
426 heading_matched BOOLEAN DEFAULT FALSE,
427 imported BOOLEAN DEFAULT FALSE,
428 to_import BOOLEAN DEFAULT TRUE,
435 binmode STDIN, ':utf8';
436 my $ins = $dbh->prepare(qq{
437 INSERT INTO $schema.auths_$batch (marc, auth_id, lccn, cancelled_lccn, heading)
438 VALUES (?, ?, ?, ?, authority.normalize_heading(?))
444 report_progress("Records staged", $i);
448 my $marc = MARC::Record->new_from_usmarc($_);
449 my $authid = $marc->subfield('901', 'c');
450 if (defined($authid) && $authid !~ /^\d+$/) {
453 my $lccn = $marc->subfield('010', 'a');
459 my $cancelled_lccn = $marc->subfield('010', 'z');
460 if (defined $cancelled_lccn) {
461 $cancelled_lccn =~ s/^\s+//;
462 $cancelled_lccn =~ s/\s+$//;
463 $cancelled_lccn =~ s/\s+/ /g;
465 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
466 $ins->execute($xml, $authid, $lccn, $cancelled_lccn, $xml);
469 report_progress("Records staged", $i) if 0 != $i % 100;
471 CREATE INDEX auths_${batch}_auth_id_idx ON
472 $schema.auths_$batch (auth_id);
475 CREATE INDEX auths_${batch}_id_idx ON
476 $schema.auths_$batch (id);
479 CREATE INDEX auths_${batch}_lccn_idx ON
480 $schema.auths_$batch (lccn);
484 sub handle_match_auths {
485 my ($dbh, $schema, $batch) = @_;
487 my $sth = $dbh->prepare(qq{
488 UPDATE $schema.auths_${batch} a
489 SET auth_id = b.record,
491 existing_heading = authority.normalize_heading(c.marc)
492 FROM authority.full_rec b
493 JOIN authority.record_entry c ON (b.record = c.id)
498 AND lccn IS NOT NULL;
502 report_progress("Matched $ct authorities on LCCN");
504 $sth = $dbh->prepare(qq{
505 UPDATE $schema.auths_${batch} a
506 SET cancelled_auth_id = b.record
507 FROM authority.full_rec b
510 AND value = cancelled_lccn
512 AND cancelled_lccn IS NOT NULL;
516 report_progress("Matched $ct authorities on cancelled LCCN");
518 $sth = $dbh->prepare(qq{
519 UPDATE $schema.auths_$batch a
521 heading_matched = TRUE,
522 existing_heading = b.heading
523 FROM authority.record_entry b
524 WHERE a.heading = b.heading
529 report_progress("Matched $ct authorities on heading");
532 sub handle_load_new_auths {
537 my $getct = $dbh->prepare(qq{
539 FROM $schema.auths_$batch
542 AND new_auth_id IS NULL
544 AND cancelled_auth_id IS NULL
547 my $max = $getct->fetchrow_arrayref()->[0];
549 report_progress('Number of authorities to add', $max);
550 for (my $i = 1; $i <= $max; $i++) {
551 report_progress('... authorities added', $i) if 0 == $i % 10 or $i == $max;
554 INSERT INTO authority.record_entry (marc, last_xact_id)
555 SELECT marc, ? || '-' || id
556 FROM $schema.auths_$batch b
559 FROM $schema.auths_$batch
562 AND new_auth_id IS NULL
564 AND cancelled_auth_id IS NULL
568 }, {}, "auths_$batch");
570 UPDATE $schema.auths_$batch
572 new_auth_id = CURRVAL('authority.record_entry_id_seq')
575 FROM $schema.auths_$batch
578 AND new_auth_id IS NULL
580 AND cancelled_auth_id IS NULL
589 sub handle_overlay_auths_stage1 {
594 my $getct = $dbh->prepare(qq{
596 FROM $schema.auths_$batch
600 AND heading = existing_heading
603 my $max = $getct->fetchrow_arrayref()->[0];
604 report_progress('Number of auths to update', $max);
607 UPDATE config.internal_flag SET enabled = TRUE
608 WHERE name = 'ingest.disable_authority_auto_update';
610 for (my $i = 1; $i <= $max; $i++) {
611 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
614 UPDATE authority.record_entry a
617 FROM $schema.auths_$batch b
618 WHERE a.id = b.auth_id
621 FROM $schema.auths_$batch
625 AND heading = existing_heading
631 UPDATE $schema.auths_$batch
635 FROM $schema.auths_$batch
639 AND heading = existing_heading
647 UPDATE config.internal_flag SET enabled = FALSE
648 WHERE name = 'ingest.disable_authority_auto_update';
652 sub handle_overlay_auths_stage2 {
657 my $getct = $dbh->prepare(qq{
659 FROM $schema.auths_$batch
663 AND heading <> existing_heading
666 my $max = $getct->fetchrow_arrayref()->[0];
667 report_progress('Number of auths to update', $max);
669 for (my $i = 1; $i <= $max; $i++) {
670 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
673 UPDATE authority.record_entry a
676 FROM $schema.auths_$batch b
677 WHERE a.id = b.auth_id
680 FROM $schema.auths_$batch
684 AND heading <> existing_heading
690 UPDATE $schema.auths_$batch
694 FROM $schema.auths_$batch
698 AND heading <> existing_heading
707 sub handle_overlay_auths_stage3 {
712 my $getct = $dbh->prepare(qq{
714 FROM $schema.auths_$batch
719 auth_id = cancelled_auth_id
721 AND cancelled_auth_id IS NOT NULL
724 my $max = $getct->fetchrow_arrayref()->[0];
725 report_progress('Number of auths to update', $max);
727 for (my $i = 1; $i <= $max; $i++) {
728 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
731 UPDATE authority.record_entry a
734 FROM $schema.auths_$batch b
735 WHERE a.id = b.cancelled_auth_id
736 AND cancelled_auth_id IN (
738 FROM $schema.auths_$batch
743 auth_id = cancelled_auth_id
745 AND cancelled_auth_id IS NOT NULL
751 UPDATE $schema.auths_$batch
753 WHERE cancelled_auth_id IN (
754 SELECT cancelled_auth_id
755 FROM $schema.auths_$batch
760 auth_id = cancelled_auth_id
762 AND cancelled_auth_id IS NOT NULL
771 sub handle_link_auth_auth {
777 UPDATE config.internal_flag SET enabled = TRUE
778 WHERE name = 'ingest.disable_authority_auto_update';
781 my $sth = $dbh->prepare(qq{
782 SELECT COALESCE(new_auth_id, auth_id, cancelled_auth_id) AS id
783 FROM $schema.auths_$batch
788 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
790 report_progress(scalar(@ids) . " records to do auth-auth linking");
791 foreach my $id (@ids) {
793 report_progress('... auth-auth linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
794 system "/openils/bin/authority_authority_linker.pl -r $id -c /openils/conf/opensrf_core.xml";
798 UPDATE config.internal_flag SET enabled = FALSE
799 WHERE name = 'ingest.disable_authority_auto_update';
803 sub handle_link_auth_bib {
807 my $link_skipped = shift;
815 AND skip_reason ~ '^edit'
827 my $sth = $dbh->prepare($query);
829 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
831 report_progress(scalar(@ids) . " records to do auth-bib linking");
832 foreach my $id (@ids) {
834 report_progress('... auth-bib linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
835 system "/openils/bin/authority_control_fields.pl --record $id -c /openils/conf/opensrf_core.xml";
840 sub handle_export_skipped_bibs {
847 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
848 binmode $outfh, ':utf8';
850 my $sth = $dbh->prepare(qq{
853 WHERE skip_reason ~ '^edit'
858 while (my $row = $sth->fetchrow_hashref()) {
859 my $marc = MARC::Record->new_from_xml($row->{marc});
860 print $outfh $marc->as_usmarc();
865 sub handle_export_skipped_auths {
872 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
873 binmode $outfh, ':utf8';
875 my $sth = $dbh->prepare(qq{
877 FROM $schema.auths_$batch
883 while (my $row = $sth->fetchrow_hashref()) {
884 my $marc = MARC::Record->new_from_xml($row->{marc});
885 print $outfh $marc->as_usmarc();