3 # Copyright (c) 2016 Equinox Software, Inc.
4 # Author: Galen Charlton <gmc@esilibrary.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 use MARC::File::XML (BinaryEncoding => 'utf8');
26 use OpenILS::Application::AppUtils;
29 my $schema = 'bib_loads';
41 'action:s' => \$action,
42 'schema:s' => \$schema,
44 'dbuser:s' => \$dbuser,
45 'dbhost:s' => \$dbhost,
48 'cutoff:s' => \$cutoff,
50 'output:s' => \$output,
51 'link-skipped' => \$link_skipped,
54 abort('must specify --action') unless defined $action;
55 abort('must specify --schema') unless defined $schema;
56 abort('must specify --db') unless defined $db;
57 abort('must specify --dbuser') unless defined $dbuser;
58 abort('must specify --dbhost') unless defined $dbhost;
59 abort('must specify --dbpw') unless defined $dbpw;
60 abort('must specify --batch') unless defined $batch;
62 abort(q{--action must be "stage_bibs", "filter_bibs", "load_bibs", "stage_auths",
63 "match_auths", "load_new_auths", "overlay_auths_stage1",
64 "overlay_auths_stage2", "overlay_auths_stage3", "link_auth_auth",
65 "link_auth_bib", "export_skipped_bibs", or "export_skipped_auths"}) unless
66 $action eq 'filter_bibs' or
67 $action eq 'stage_bibs' or
68 $action eq 'load_bibs' or
69 $action eq 'stage_auths' or
70 $action eq 'match_auths' or
71 $action eq 'load_new_auths' or
72 $action eq 'overlay_auths_stage1' or
73 $action eq 'overlay_auths_stage2' or
74 $action eq 'overlay_auths_stage3' or
75 $action eq 'link_auth_auth' or
76 $action eq 'link_auth_bib' or
77 $action eq 'export_skipped_bibs' or
78 $action eq 'export_skipped_auths'
81 my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost);
83 if ($action eq 'stage_bibs') {
84 abort('must specify at least one input file') unless @ARGV;
85 handle_stage_bibs($dbh, $schema, $batch);
88 if ($action eq 'filter_bibs') {
89 abort('must specify cutoff date when filtering') unless defined $cutoff;
90 handle_filter_bibs($dbh, $schema, $batch, $cutoff);
93 if ($action eq 'load_bibs' ) {
94 handle_load_bibs($dbh, $schema, $batch, $wait);
97 if ($action eq 'stage_auths') {
98 abort('must specify at least one input file') unless @ARGV;
99 handle_stage_auths($dbh, $schema, $batch);
102 if ($action eq 'match_auths') {
103 handle_match_auths($dbh, $schema, $batch);
106 if ($action eq 'load_new_auths') {
107 handle_load_new_auths($dbh, $schema, $batch);
110 if ($action eq 'overlay_auths_stage1') {
111 handle_overlay_auths_stage1($dbh, $schema, $batch);
113 if ($action eq 'overlay_auths_stage2') {
114 handle_overlay_auths_stage2($dbh, $schema, $batch);
116 if ($action eq 'overlay_auths_stage3') {
117 handle_overlay_auths_stage3($dbh, $schema, $batch);
120 if ($action eq 'link_auth_auth') {
121 handle_link_auth_auth($dbh, $schema, $batch);
123 if ($action eq 'link_auth_bib') {
124 handle_link_auth_bib($dbh, $schema, $batch, $link_skipped);
127 if ($action eq 'export_skipped_bibs') {
128 abort('must specify output file') unless defined $output;
129 handle_export_skipped_bibs($dbh, $schema, $batch, $output);
131 if ($action eq 'export_skipped_auths') {
132 abort('must specify output file') unless defined $output;
133 handle_export_skipped_auths($dbh, $schema, $batch, $output);
138 print STDERR "$0: $msg", "\n";
146 Utility to stage and overlay bib records in an Evergreen database. This
147 expects that the incoming records will have been previously exported
148 from that Evergreen database and modified in some fashion (e.g., for
149 authority record processing) and that the bib ID can be found in the
152 This program has several modes controlled by the --action switch:
154 --action stage_bibs - load MARC bib records into a staging table
155 --action filter_bibs - mark previously staged bibs that should
156 be excluded from a subsequent load, either
157 because the target bib is deleted in Evergreen
158 or the record was modified after a date
159 specified by the --cutoff switch
160 --action load_bibs - overlay bib records using a previously staged
161 batch, one at a time. After each bib, it will
162 wait the number of seconds specified by the
165 --action stage_auths - load MARC authorities into staging
167 --action match_auths - identify matches with authority
168 records already present in the
169 Evergreen database; matching is
170 based on LCCN, cancelled LCCN, and
172 --action load_new_auths - load new (unmatched) authorities
173 --action overlay_auths_stage1 - overlay based on LCCN where
174 heading has NOT changed; this step
175 disables propagation to bib records
176 --action overlay_auths_stage2 - overlay based on LCCN where heading
177 HAS changed; propagates changes
179 --action overlay_auths_stage3 - overlay for records where a cancelled
180 LCCN is replaced with a new one
181 --action link_auth_auth - run authority_authority_linker.pl for
182 the authorities that were overlaid
183 or added in this batch.
184 --action link_auth_bib - run authority_control_fields.pl for
185 the bibs that were overlaid in this
186 batch. Add --link-skipped to specify
187 that bibs that were matched but
188 skipped due to having be edited after
189 the cutoff should be linked (rather
190 than linking the imported bibs)
191 --action export_skipped_bibs - export to ISO2709 file whose name is
192 specified by --output those bibs
193 that had been edited after the cutoff.
194 --action export_skipped_auths - export to ISO2709 file whose name is
195 specified by --output those authorities
196 that could not be definitively
197 handled as updates or adds.
199 Several switches are used regardless of the specified action:
201 --schema - Pg schema in which staging table will live; should be
203 --batch - name of bib batch; will also be used as the name
204 of the staging tables
206 --dbuser - database user
207 --dbpw - database password
208 --dbhost - database host
212 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
213 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
214 --action stage_bibs -- file1.mrc file2.mrc [...]
216 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
217 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
218 --action filter_bibs --cutoff 2016-01-02
220 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
221 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
222 --action load_bibs --wait 2
228 sub report_progress {
229 my ($msg, $counter) = @_;
230 if (defined $counter) {
231 print STDERR "$msg: $counter\n";
233 print STDERR "$msg\n";
238 my ($db, $dbuser, $dbpw, $dbhost) = @_;
240 my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432";
243 ShowErrorStatement => 1,
248 my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs);
253 sub handle_stage_bibs {
259 DROP TABLE IF EXISTS $schema.$batch;
262 CREATE TABLE $schema.$batch (
266 imported BOOLEAN DEFAULT FALSE,
267 to_import BOOLEAN DEFAULT TRUE,
274 binmode STDIN, ':utf8';
275 my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)");
280 report_progress("Records staged", $i);
285 my $marc = MARC::Record->new_from_usmarc($_);
286 my $bibid = $marc->subfield('901', 'c');
287 if ($bibid !~ /^\d+$/) {
288 die('Subfield 901$c is not numeric or missing.');
290 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
291 $ins->execute($xml, $bibid);
294 warn("Record $i is bad: $@; skipping.");
299 report_progress("Records staged", $i) if 0 != $i % 100;
301 CREATE INDEX ${batch}_bib_id_idx ON
302 $schema.$batch (bib_id);
305 CREATE INDEX ${batch}_id_idx ON
310 sub handle_filter_bibs {
316 my $sth1 = $dbh->prepare(qq{
317 UPDATE $schema.$batch
318 SET to_import = FALSE,
319 skip_reason = 'deleted'
322 FROM biblio.record_entry
329 my $ct = $sth1->rows;
330 report_progress("Filtering out $ct records that are currently deleted");
332 my $sth2 = $dbh->prepare(qq{
333 UPDATE $schema.$batch
334 SET to_import = FALSE,
335 skip_reason = 'edited after cutoff of $cutoff'
338 FROM biblio.record_entry
344 $sth2->execute($cutoff);
346 report_progress("Filtering out $ct records edited after cutoff date of $cutoff");
348 my $sth3 = $dbh->prepare(qq{
349 UPDATE $schema.$batch
350 SET to_import = FALSE,
351 skip_reason = 'XML is not well-formed'
352 WHERE NOT xml_is_well_formed(marc)
358 report_progress("Filtering out $ct records whose XML is not well-formed");
361 sub handle_load_bibs {
367 my $getct = $dbh->prepare(qq{
374 my $max = $getct->fetchrow_arrayref()->[0];
376 report_progress('Number of bibs to update', $max);
377 for (my $i = 1; $i <= $max; $i++) {
378 report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max;
381 UPDATE biblio.record_entry a
383 FROM $schema.$batch b
384 WHERE a.id = b.bib_id
395 UPDATE $schema.$batch
411 sub handle_stage_auths {
417 DROP TABLE IF EXISTS $schema.auths_$batch;
420 CREATE TABLE $schema.auths_$batch (
425 existing_heading TEXT,
428 cancelled_auth_id BIGINT,
430 lccn_matched BOOLEAN DEFAULT FALSE,
431 heading_matched BOOLEAN DEFAULT FALSE,
432 imported BOOLEAN DEFAULT FALSE,
433 to_import BOOLEAN DEFAULT TRUE,
440 binmode STDIN, ':utf8';
441 my $ins = $dbh->prepare(qq{
442 INSERT INTO $schema.auths_$batch (marc, auth_id, lccn, cancelled_lccn, heading)
443 VALUES (?, ?, ?, ?, authority.normalize_heading(?))
449 report_progress("Records staged", $i);
454 my $marc = MARC::Record->new_from_usmarc($_);
455 my $authid = $marc->subfield('901', 'c');
456 if (defined($authid) && $authid !~ /^\d+$/) {
459 my $lccn = $marc->subfield('010', 'a');
465 my $cancelled_lccn = $marc->subfield('010', 'z');
466 if (defined $cancelled_lccn) {
467 $cancelled_lccn =~ s/^\s+//;
468 $cancelled_lccn =~ s/\s+$//;
469 $cancelled_lccn =~ s/\s+/ /g;
471 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
472 $ins->execute($xml, $authid, $lccn, $cancelled_lccn, $xml);
475 warn("Record $i is bad: $@; skipping.");
480 report_progress("Records staged", $i) if 0 != $i % 100;
482 CREATE INDEX auths_${batch}_auth_id_idx ON
483 $schema.auths_$batch (auth_id);
486 CREATE INDEX auths_${batch}_id_idx ON
487 $schema.auths_$batch (id);
490 CREATE INDEX auths_${batch}_lccn_idx ON
491 $schema.auths_$batch (lccn);
495 sub handle_match_auths {
496 my ($dbh, $schema, $batch) = @_;
498 my $sth = $dbh->prepare(qq{
499 UPDATE $schema.auths_${batch} a
500 SET auth_id = b.record,
502 existing_heading = authority.normalize_heading(c.marc)
503 FROM authority.full_rec b
504 JOIN authority.record_entry c ON (b.record = c.id)
509 AND lccn IS NOT NULL;
513 report_progress("Matched $ct authorities on LCCN");
515 $sth = $dbh->prepare(qq{
516 UPDATE $schema.auths_${batch} a
517 SET cancelled_auth_id = b.record
518 FROM authority.full_rec b
521 AND value = cancelled_lccn
523 AND cancelled_lccn IS NOT NULL;
527 report_progress("Matched $ct authorities on cancelled LCCN");
529 $sth = $dbh->prepare(qq{
530 UPDATE $schema.auths_$batch a
532 heading_matched = TRUE,
533 existing_heading = b.heading
534 FROM authority.record_entry b
535 WHERE a.heading = b.heading
540 report_progress("Matched $ct authorities on heading");
543 sub handle_load_new_auths {
548 my $getct = $dbh->prepare(qq{
550 FROM $schema.auths_$batch
553 AND new_auth_id IS NULL
555 AND cancelled_auth_id IS NULL
558 my $max = $getct->fetchrow_arrayref()->[0];
560 report_progress('Number of authorities to add', $max);
561 for (my $i = 1; $i <= $max; $i++) {
562 report_progress('... authorities added', $i) if 0 == $i % 10 or $i == $max;
565 INSERT INTO authority.record_entry (marc, last_xact_id)
566 SELECT marc, ? || '-' || id
567 FROM $schema.auths_$batch b
570 FROM $schema.auths_$batch
573 AND new_auth_id IS NULL
575 AND cancelled_auth_id IS NULL
579 }, {}, "auths_$batch");
581 UPDATE $schema.auths_$batch
583 new_auth_id = CURRVAL('authority.record_entry_id_seq')
586 FROM $schema.auths_$batch
589 AND new_auth_id IS NULL
591 AND cancelled_auth_id IS NULL
601 sub handle_overlay_auths_stage1 {
606 my $getct = $dbh->prepare(qq{
608 FROM $schema.auths_$batch
612 AND heading = existing_heading
615 my $max = $getct->fetchrow_arrayref()->[0];
616 report_progress('Number of auths to update', $max);
619 UPDATE config.internal_flag SET enabled = TRUE
620 WHERE name = 'ingest.disable_authority_auto_update';
622 for (my $i = 1; $i <= $max; $i++) {
623 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
626 UPDATE authority.record_entry a
629 FROM $schema.auths_$batch b
630 WHERE a.id = b.auth_id
633 FROM $schema.auths_$batch
637 AND heading = existing_heading
643 UPDATE $schema.auths_$batch
647 FROM $schema.auths_$batch
651 AND heading = existing_heading
659 UPDATE config.internal_flag SET enabled = FALSE
660 WHERE name = 'ingest.disable_authority_auto_update';
664 sub handle_overlay_auths_stage2 {
669 my $getct = $dbh->prepare(qq{
671 FROM $schema.auths_$batch
675 AND heading <> existing_heading
678 my $max = $getct->fetchrow_arrayref()->[0];
679 report_progress('Number of auths to update', $max);
681 for (my $i = 1; $i <= $max; $i++) {
682 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
685 UPDATE authority.record_entry a
688 FROM $schema.auths_$batch b
689 WHERE a.id = b.auth_id
692 FROM $schema.auths_$batch
696 AND heading <> existing_heading
702 UPDATE $schema.auths_$batch
706 FROM $schema.auths_$batch
710 AND heading <> existing_heading
719 sub handle_overlay_auths_stage3 {
724 my $getct = $dbh->prepare(qq{
726 FROM $schema.auths_$batch
731 auth_id = cancelled_auth_id
733 AND cancelled_auth_id IS NOT NULL
736 my $max = $getct->fetchrow_arrayref()->[0];
737 report_progress('Number of auths to update', $max);
739 for (my $i = 1; $i <= $max; $i++) {
740 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
743 UPDATE authority.record_entry a
746 FROM $schema.auths_$batch b
747 WHERE a.id = b.cancelled_auth_id
748 AND cancelled_auth_id IN (
750 FROM $schema.auths_$batch
755 auth_id = cancelled_auth_id
757 AND cancelled_auth_id IS NOT NULL
763 UPDATE $schema.auths_$batch
765 WHERE cancelled_auth_id IN (
766 SELECT cancelled_auth_id
767 FROM $schema.auths_$batch
772 auth_id = cancelled_auth_id
774 AND cancelled_auth_id IS NOT NULL
783 sub handle_link_auth_auth {
789 UPDATE config.internal_flag SET enabled = TRUE
790 WHERE name = 'ingest.disable_authority_auto_update';
793 my $sth = $dbh->prepare(qq{
794 SELECT COALESCE(new_auth_id, auth_id, cancelled_auth_id) AS id
795 FROM $schema.auths_$batch
800 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
802 report_progress(scalar(@ids) . " records to do auth-auth linking");
803 foreach my $id (@ids) {
805 report_progress('... auth-auth linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
806 system "/openils/bin/authority_authority_linker.pl -r $id -c /openils/conf/opensrf_core.xml";
810 UPDATE config.internal_flag SET enabled = FALSE
811 WHERE name = 'ingest.disable_authority_auto_update';
815 sub handle_link_auth_bib {
819 my $link_skipped = shift;
827 AND skip_reason ~ '^edit'
839 my $sth = $dbh->prepare($query);
841 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
843 report_progress(scalar(@ids) . " records to do auth-bib linking");
844 foreach my $id (@ids) {
846 report_progress('... auth-bib linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
847 system "/openils/bin/authority_control_fields.pl --record $id -c /openils/conf/opensrf_core.xml";
852 sub handle_export_skipped_bibs {
859 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
860 binmode $outfh, ':utf8';
862 my $sth = $dbh->prepare(qq{
865 WHERE skip_reason ~ '^edit'
870 while (my $row = $sth->fetchrow_hashref()) {
871 my $marc = MARC::Record->new_from_xml($row->{marc});
872 print $outfh $marc->as_usmarc();
877 sub handle_export_skipped_auths {
884 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
885 binmode $outfh, ':utf8';
887 my $sth = $dbh->prepare(qq{
889 FROM $schema.auths_$batch
895 while (my $row = $sth->fetchrow_hashref()) {
896 my $marc = MARC::Record->new_from_xml($row->{marc});
897 print $outfh $marc->as_usmarc();