3 # Copyright (c) 2016 Equinox Software, Inc.
4 # Author: Galen Charlton <gmc@esilibrary.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 use MARC::File::XML (BinaryEncoding => 'utf8');
26 use OpenILS::Application::AppUtils;
29 my $schema = 'bib_loads';
41 'action:s' => \$action,
42 'schema:s' => \$schema,
44 'dbuser:s' => \$dbuser,
45 'dbhost:s' => \$dbhost,
48 'cutoff:s' => \$cutoff,
50 'output:s' => \$output,
51 'link-skipped' => \$link_skipped,
54 abort('must specify --action') unless defined $action;
55 abort('must specify --schema') unless defined $schema;
56 abort('must specify --db') unless defined $db;
57 abort('must specify --dbuser') unless defined $dbuser;
58 abort('must specify --dbhost') unless defined $dbhost;
59 abort('must specify --dbpw') unless defined $dbpw;
60 abort('must specify --batch') unless defined $batch;
62 abort(q{--action must be "stage_bibs", "filter_bibs", "load_bibs", "stage_auths",
63 "match_auths", "load_new_auths", "overlay_auths_stage1",
64 "overlay_auths_stage2", "overlay_auths_stage3", "link_auth_auth",
65 "link_auth_bib", "export_skipped_bibs", or "export_skipped_auths"}) unless
66 $action eq 'filter_bibs' or
67 $action eq 'stage_bibs' or
68 $action eq 'load_bibs' or
69 $action eq 'stage_auths' or
70 $action eq 'match_auths' or
71 $action eq 'load_new_auths' or
72 $action eq 'overlay_auths_stage1' or
73 $action eq 'overlay_auths_stage2' or
74 $action eq 'overlay_auths_stage3' or
75 $action eq 'link_auth_auth' or
76 $action eq 'link_auth_bib' or
77 $action eq 'export_skipped_bibs' or
78 $action eq 'export_skipped_auths'
81 my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost);
83 if ($action eq 'stage_bibs') {
84 abort('must specify at least one input file') unless @ARGV;
85 handle_stage_bibs($dbh, $schema, $batch);
88 if ($action eq 'filter_bibs') {
89 abort('must specify cutoff date when filtering') unless defined $cutoff;
90 handle_filter_bibs($dbh, $schema, $batch, $cutoff);
93 if ($action eq 'load_bibs' ) {
94 handle_load_bibs($dbh, $schema, $batch, $wait);
97 if ($action eq 'stage_auths') {
98 abort('must specify at least one input file') unless @ARGV;
99 handle_stage_auths($dbh, $schema, $batch);
102 if ($action eq 'match_auths') {
103 handle_match_auths($dbh, $schema, $batch);
106 if ($action eq 'load_new_auths') {
107 handle_load_new_auths($dbh, $schema, $batch);
110 if ($action eq 'overlay_auths_stage1') {
111 handle_overlay_auths_stage1($dbh, $schema, $batch);
113 if ($action eq 'overlay_auths_stage2') {
114 handle_overlay_auths_stage2($dbh, $schema, $batch);
116 if ($action eq 'overlay_auths_stage3') {
117 handle_overlay_auths_stage3($dbh, $schema, $batch);
120 if ($action eq 'link_auth_auth') {
121 handle_link_auth_auth($dbh, $schema, $batch);
123 if ($action eq 'link_auth_bib') {
124 handle_link_auth_bib($dbh, $schema, $batch, $link_skipped);
127 if ($action eq 'export_skipped_bibs') {
128 abort('must specify output file') unless defined $output;
129 handle_export_skipped_bibs($dbh, $schema, $batch, $output);
131 if ($action eq 'export_skipped_auths') {
132 abort('must specify output file') unless defined $output;
133 handle_export_skipped_auths($dbh, $schema, $batch, $output);
138 print STDERR "$0: $msg", "\n";
146 Utility to stage and overlay bib records in an Evergreen database. This
147 expects that the incoming records will have been previously exported
148 from that Evergreen database and modified in some fashion (e.g., for
149 authority record processing) and that the bib ID can be found in the
152 This program has several modes controlled by the --action switch:
154 --action stage_bibs - load MARC bib records into a staging table
155 --action filter_bibs - mark previously staged bibs that should
156 be excluded from a subsequent load, either
157 because the target bib is deleted in Evergreen
158 or the record was modified after a date
159 specified by the --cutoff switch
160 --action load_bibs - overlay bib records using a previously staged
161 batch, one at a time. After each bib, it will
162 wait the number of seconds specified by the
165 --action stage_auths - load MARC authorities into staging
167 --action match_auths - identify matches with authority
168 records already present in the
169 Evergreen database; matching is
170 based on LCCN, cancelled LCCN, and
172 --action load_new_auths - load new (unmatched) authorities
173 --action overlay_auths_stage1 - overlay based on LCCN where
174 heading has NOT changed; this step
175 disables propagation to bib records
176 --action overlay_auths_stage2 - overlay based on LCCN where heading
177 HAS changed; propagates changes
179 --action overlay_auths_stage3 - overlay for records where a cancelled
180 LCCN is replaced with a new one
181 --action link_auth_auth - run authority_authority_linker.pl for
182 the authorities that were overlaid
183 or added in this batch.
184 --action link_auth_bib - run authority_control_fields.pl for
185 the bibs that were overlaid in this
186 batch. Add --link-skipped to specify
187 that bibs that were matched but
188 skipped due to having be edited after
189 the cutoff should be linked (rather
190 than linking the imported bibs)
191 --action export_skipped_bibs - export to ISO2709 file whose name is
192 specified by --output those bibs
193 that had been edited after the cutoff.
194 --action export_skipped_auths - export to ISO2709 file whose name is
195 specified by --output those authorities
196 that could not be definitively
197 handled as updates or adds.
199 Several switches are used regardless of the specified action:
201 --schema - Pg schema in which staging table will live; should be
203 --batch - name of bib batch; will also be used as the name
204 of the staging tables
206 --dbuser - database user
207 --dbpw - database password
208 --dbhost - database host
212 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
213 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
214 --action stage_bibs -- file1.mrc file2.mrc [...]
216 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
217 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
218 --action filter_bibs --cutoff 2016-01-02
220 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
221 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
222 --action load_bibs --wait 2
228 sub report_progress {
229 my ($msg, $counter) = @_;
230 if (defined $counter) {
231 print STDERR "$msg: $counter\n";
233 print STDERR "$msg\n";
238 my ($db, $dbuser, $dbpw, $dbhost) = @_;
240 my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432";
243 ShowErrorStatement => 1,
248 my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs);
253 sub handle_stage_bibs {
259 DROP TABLE IF EXISTS $schema.$batch;
262 CREATE TABLE $schema.$batch (
266 imported BOOLEAN DEFAULT FALSE,
267 to_import BOOLEAN DEFAULT TRUE,
274 binmode STDIN, ':utf8';
275 my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)");
280 report_progress("Records staged", $i);
285 my $marc = MARC::Record->new_from_usmarc($_);
286 my $bibid = $marc->subfield('901', 'c');
287 if ($bibid !~ /^\d+$/) {
288 print STDERR "Record $i is suspect; skipping\n";
291 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
292 $ins->execute($xml, $bibid);
295 print STDERR "Record $i is bad; skipping\n";
300 report_progress("Records staged", $i) if 0 != $i % 100;
302 CREATE INDEX ${batch}_bib_id_idx ON
303 $schema.$batch (bib_id);
306 CREATE INDEX ${batch}_id_idx ON
311 sub handle_filter_bibs {
317 my $sth1 = $dbh->prepare(qq{
318 UPDATE $schema.$batch
319 SET to_import = FALSE,
320 skip_reason = 'deleted'
323 FROM biblio.record_entry
330 my $ct = $sth1->rows;
331 report_progress("Filtering out $ct records that are currently deleted");
333 my $sth2 = $dbh->prepare(qq{
334 UPDATE $schema.$batch
335 SET to_import = FALSE,
336 skip_reason = 'edited after cutoff of $cutoff'
339 FROM biblio.record_entry
345 $sth2->execute($cutoff);
347 report_progress("Filtering out $ct records edited after cutoff date of $cutoff");
349 my $sth3 = $dbh->prepare(qq{
350 UPDATE $schema.$batch
351 SET to_import = FALSE,
352 skip_reason = 'XML is not well-formed'
353 WHERE NOT xml_is_well_formed(marc)
359 report_progress("Filtering out $ct records whose XML is not well-formed");
362 sub handle_load_bibs {
368 my $getct = $dbh->prepare(qq{
375 my $max = $getct->fetchrow_arrayref()->[0];
377 report_progress('Number of bibs to update', $max);
378 for (my $i = 1; $i <= $max; $i++) {
379 report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max;
382 UPDATE biblio.record_entry a
384 FROM $schema.$batch b
385 WHERE a.id = b.bib_id
396 UPDATE $schema.$batch
412 sub handle_stage_auths {
418 DROP TABLE IF EXISTS $schema.auths_$batch;
421 CREATE TABLE $schema.auths_$batch (
426 existing_heading TEXT,
429 cancelled_auth_id BIGINT,
431 lccn_matched BOOLEAN DEFAULT FALSE,
432 heading_matched BOOLEAN DEFAULT FALSE,
433 imported BOOLEAN DEFAULT FALSE,
434 to_import BOOLEAN DEFAULT TRUE,
441 binmode STDIN, ':utf8';
442 my $ins = $dbh->prepare(qq{
443 INSERT INTO $schema.auths_$batch (marc, auth_id, lccn, cancelled_lccn, heading)
444 VALUES (?, ?, ?, ?, authority.normalize_heading(?))
450 report_progress("Records staged", $i);
455 my $marc = MARC::Record->new_from_usmarc($_);
456 my $authid = $marc->subfield('901', 'c');
457 if (defined($authid) && $authid !~ /^\d+$/) {
460 my $lccn = $marc->subfield('010', 'a');
466 my $cancelled_lccn = $marc->subfield('010', 'z');
467 if (defined $cancelled_lccn) {
468 $cancelled_lccn =~ s/^\s+//;
469 $cancelled_lccn =~ s/\s+$//;
470 $cancelled_lccn =~ s/\s+/ /g;
472 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
473 $ins->execute($xml, $authid, $lccn, $cancelled_lccn, $xml);
476 print STDERR "Record $i is bad; skipping\n";
481 report_progress("Records staged", $i) if 0 != $i % 100;
483 CREATE INDEX auths_${batch}_auth_id_idx ON
484 $schema.auths_$batch (auth_id);
487 CREATE INDEX auths_${batch}_id_idx ON
488 $schema.auths_$batch (id);
491 CREATE INDEX auths_${batch}_lccn_idx ON
492 $schema.auths_$batch (lccn);
496 sub handle_match_auths {
497 my ($dbh, $schema, $batch) = @_;
499 my $sth = $dbh->prepare(qq{
500 UPDATE $schema.auths_${batch} a
501 SET auth_id = b.record,
503 existing_heading = authority.normalize_heading(c.marc)
504 FROM authority.full_rec b
505 JOIN authority.record_entry c ON (b.record = c.id)
510 AND lccn IS NOT NULL;
514 report_progress("Matched $ct authorities on LCCN");
516 $sth = $dbh->prepare(qq{
517 UPDATE $schema.auths_${batch} a
518 SET cancelled_auth_id = b.record
519 FROM authority.full_rec b
522 AND value = cancelled_lccn
524 AND cancelled_lccn IS NOT NULL;
528 report_progress("Matched $ct authorities on cancelled LCCN");
530 $sth = $dbh->prepare(qq{
531 UPDATE $schema.auths_$batch a
533 heading_matched = TRUE,
534 existing_heading = b.heading
535 FROM authority.record_entry b
536 WHERE a.heading = b.heading
541 report_progress("Matched $ct authorities on heading");
544 sub handle_load_new_auths {
549 my $getct = $dbh->prepare(qq{
551 FROM $schema.auths_$batch
554 AND new_auth_id IS NULL
556 AND cancelled_auth_id IS NULL
559 my $max = $getct->fetchrow_arrayref()->[0];
561 report_progress('Number of authorities to add', $max);
562 for (my $i = 1; $i <= $max; $i++) {
563 report_progress('... authorities added', $i) if 0 == $i % 10 or $i == $max;
566 INSERT INTO authority.record_entry (marc, last_xact_id)
567 SELECT marc, ? || '-' || id
568 FROM $schema.auths_$batch b
571 FROM $schema.auths_$batch
574 AND new_auth_id IS NULL
576 AND cancelled_auth_id IS NULL
580 }, {}, "auths_$batch");
582 UPDATE $schema.auths_$batch
584 new_auth_id = CURRVAL('authority.record_entry_id_seq')
587 FROM $schema.auths_$batch
590 AND new_auth_id IS NULL
592 AND cancelled_auth_id IS NULL
601 sub handle_overlay_auths_stage1 {
606 my $getct = $dbh->prepare(qq{
608 FROM $schema.auths_$batch
612 AND heading = existing_heading
615 my $max = $getct->fetchrow_arrayref()->[0];
616 report_progress('Number of auths to update', $max);
619 UPDATE config.internal_flag SET enabled = TRUE
620 WHERE name = 'ingest.disable_authority_auto_update';
622 for (my $i = 1; $i <= $max; $i++) {
623 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
626 UPDATE authority.record_entry a
629 FROM $schema.auths_$batch b
630 WHERE a.id = b.auth_id
633 FROM $schema.auths_$batch
637 AND heading = existing_heading
643 UPDATE $schema.auths_$batch
647 FROM $schema.auths_$batch
651 AND heading = existing_heading
659 UPDATE config.internal_flag SET enabled = FALSE
660 WHERE name = 'ingest.disable_authority_auto_update';
664 sub handle_overlay_auths_stage2 {
669 my $getct = $dbh->prepare(qq{
671 FROM $schema.auths_$batch
675 AND heading <> existing_heading
678 my $max = $getct->fetchrow_arrayref()->[0];
679 report_progress('Number of auths to update', $max);
681 for (my $i = 1; $i <= $max; $i++) {
682 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
685 UPDATE authority.record_entry a
688 FROM $schema.auths_$batch b
689 WHERE a.id = b.auth_id
692 FROM $schema.auths_$batch
696 AND heading <> existing_heading
702 UPDATE $schema.auths_$batch
706 FROM $schema.auths_$batch
710 AND heading <> existing_heading
719 sub handle_overlay_auths_stage3 {
724 my $getct = $dbh->prepare(qq{
726 FROM $schema.auths_$batch
731 auth_id = cancelled_auth_id
733 AND cancelled_auth_id IS NOT NULL
736 my $max = $getct->fetchrow_arrayref()->[0];
737 report_progress('Number of auths to update', $max);
739 for (my $i = 1; $i <= $max; $i++) {
740 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
743 UPDATE authority.record_entry a
746 FROM $schema.auths_$batch b
747 WHERE a.id = b.cancelled_auth_id
748 AND cancelled_auth_id IN (
750 FROM $schema.auths_$batch
755 auth_id = cancelled_auth_id
757 AND cancelled_auth_id IS NOT NULL
763 UPDATE $schema.auths_$batch
765 WHERE cancelled_auth_id IN (
766 SELECT cancelled_auth_id
767 FROM $schema.auths_$batch
772 auth_id = cancelled_auth_id
774 AND cancelled_auth_id IS NOT NULL
783 sub handle_link_auth_auth {
789 UPDATE config.internal_flag SET enabled = TRUE
790 WHERE name = 'ingest.disable_authority_auto_update';
793 my $sth = $dbh->prepare(qq{
794 SELECT COALESCE(new_auth_id, auth_id, cancelled_auth_id) AS id
795 FROM $schema.auths_$batch
800 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
802 report_progress(scalar(@ids) . " records to do auth-auth linking");
803 foreach my $id (@ids) {
805 report_progress('... auth-auth linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
806 system "/openils/bin/authority_authority_linker.pl -r $id -c /openils/conf/opensrf_core.xml";
810 UPDATE config.internal_flag SET enabled = FALSE
811 WHERE name = 'ingest.disable_authority_auto_update';
815 sub handle_link_auth_bib {
819 my $link_skipped = shift;
827 AND skip_reason ~ '^edit'
839 my $sth = $dbh->prepare($query);
841 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
843 report_progress(scalar(@ids) . " records to do auth-bib linking");
844 foreach my $id (@ids) {
846 report_progress('... auth-bib linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
847 system "/openils/bin/authority_control_fields.pl --record $id -c /openils/conf/opensrf_core.xml";
852 sub handle_export_skipped_bibs {
859 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
860 binmode $outfh, ':utf8';
862 my $sth = $dbh->prepare(qq{
865 WHERE skip_reason ~ '^edit'
870 while (my $row = $sth->fetchrow_hashref()) {
871 my $marc = MARC::Record->new_from_xml($row->{marc});
872 print $outfh $marc->as_usmarc();
877 sub handle_export_skipped_auths {
884 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
885 binmode $outfh, ':utf8';
887 my $sth = $dbh->prepare(qq{
889 FROM $schema.auths_$batch
895 while (my $row = $sth->fetchrow_hashref()) {
896 my $marc = MARC::Record->new_from_xml($row->{marc});
897 print $outfh $marc->as_usmarc();