3 # Copyright (c) 2016 Equinox Software, Inc.
4 # Author: Galen Charlton <gmc@esilibrary.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 use MARC::File::XML (BinaryEncoding => 'utf8');
26 use OpenILS::Application::AppUtils;
29 my $schema = 'bib_loads';
40 'action:s' => \$action,
41 'schema:s' => \$schema,
43 'dbuser:s' => \$dbuser,
44 'dbhost:s' => \$dbhost,
47 'cutoff:s' => \$cutoff,
49 'output:s' => \$output,
52 abort('must specify --action') unless defined $action;
53 abort('must specify --schema') unless defined $schema;
54 abort('must specify --db') unless defined $db;
55 abort('must specify --dbuser') unless defined $dbuser;
56 abort('must specify --dbhost') unless defined $dbhost;
57 abort('must specify --dbpw') unless defined $dbpw;
58 abort('must specify --batch') unless defined $batch;
60 abort(q{--action must be "stage_bibs", "filter_bibs", "load_bibs", "stage_auths",
61 "match_auths", "load_new_auths", "overlay_auths_stage1",
62 "overlay_auths_stage2", "overlay_auths_stage3", "link_auth_auth",
63 "link_auth_bib", "export_skipped_bibs", or "export_skipped_auths"}) unless
64 $action eq 'filter_bibs' or
65 $action eq 'stage_bibs' or
66 $action eq 'load_bibs' or
67 $action eq 'stage_auths' or
68 $action eq 'match_auths' or
69 $action eq 'load_new_auths' or
70 $action eq 'overlay_auths_stage1' or
71 $action eq 'overlay_auths_stage2' or
72 $action eq 'overlay_auths_stage3' or
73 $action eq 'link_auth_auth' or
74 $action eq 'link_auth_bib' or
75 $action eq 'export_skipped_bibs' or
76 $action eq 'export_skipped_auths'
79 my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost);
81 if ($action eq 'stage_bibs') {
82 abort('must specify at least one input file') unless @ARGV;
83 handle_stage_bibs($dbh, $schema, $batch);
86 if ($action eq 'filter_bibs') {
87 abort('must specify cutoff date when filtering') unless defined $cutoff;
88 handle_filter_bibs($dbh, $schema, $batch, $cutoff);
91 if ($action eq 'load_bibs' ) {
92 handle_load_bibs($dbh, $schema, $batch, $wait);
95 if ($action eq 'stage_auths') {
96 abort('must specify at least one input file') unless @ARGV;
97 handle_stage_auths($dbh, $schema, $batch);
100 if ($action eq 'match_auths') {
101 handle_match_auths($dbh, $schema, $batch);
104 if ($action eq 'load_new_auths') {
105 handle_load_new_auths($dbh, $schema, $batch);
108 if ($action eq 'overlay_auths_stage1') {
109 handle_overlay_auths_stage1($dbh, $schema, $batch);
111 if ($action eq 'overlay_auths_stage2') {
112 handle_overlay_auths_stage2($dbh, $schema, $batch);
114 if ($action eq 'overlay_auths_stage3') {
115 handle_overlay_auths_stage3($dbh, $schema, $batch);
118 if ($action eq 'link_auth_auth') {
119 handle_link_auth_auth($dbh, $schema, $batch);
121 if ($action eq 'link_auth_bib') {
122 handle_link_auth_bib($dbh, $schema, $batch);
125 if ($action eq 'export_skipped_bibs') {
126 abort('must specify output file') unless defined $output;
127 handle_export_skipped_bibs($dbh, $schema, $batch, $output);
129 if ($action eq 'export_skipped_auths') {
130 abort('must specify output file') unless defined $output;
131 handle_export_skipped_auths($dbh, $schema, $batch, $output);
136 print STDERR "$0: $msg", "\n";
144 Utility to stage and overlay bib records in an Evergreen database. This
145 expects that the incoming records will have been previously exported
146 from that Evergreen database and modified in some fashion (e.g., for
147 authority record processing) and that the bib ID can be found in the
150 This program has several modes controlled by the --action switch:
152 --action stage_bibs - load MARC bib records into a staging table
153 --action filter_bibs - mark previously staged bibs that should
154 be excluded from a subsequent load, either
155 because the target bib is deleted in Evergreen
156 or the record was modified after a date
157 specified by the --cutoff switch
158 --action load_bibs - overlay bib records using a previously staged
159 batch, one at a time. After each bib, it will
160 wait the number of seconds specified by the
163 --action stage_auths - load MARC authorities into staging
165 --action match_auths - identify matches with authority
166 records already present in the
167 Evergreen database; matching is
168 based on LCCN, cancelled LCCN, and
170 --action load_new_auths - load new (unmatched) authorities
171 --action overlay_auths_stage1 - overlay based on LCCN where
172 heading has NOT changed; this step
173 disables propagation to bib records
174 --action overlay_auths_stage2 - overlay based on LCCN where heading
175 HAS changed; propagates changes
177 --action overlay_auths_stage3 - overlay for records where a cancelled
178 LCCN is replaced with a new one
179 --action link_auth_auth - run authority_authority_linker.pl for
180 the authorities that were overlaid
181 or added in this batch.
182 --action link_auth_bib - run authority_control_fields.pl for
183 the bibs that were overlaid in this
185 --action export_skipped_bibs - export to ISO2709 file whose name is
186 specified by --output those bibs
187 that had been edited after the cutoff.
188 --action export_skipped_auths - export to ISO2709 file whose name is
189 specified by --output those authorities
190 that could not be definitively
191 handled as updates or adds.
193 Several switches are used regardless of the specified action:
195 --schema - Pg schema in which staging table will live; should be
197 --batch - name of bib batch; will also be used as the name
198 of the staging tables
200 --dbuser - database user
201 --dbpw - database password
202 --dbhost - database host
206 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
207 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
208 --action stage_bibs -- file1.mrc file2.mrc [...]
210 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
211 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
212 --action filter_bibs --cutoff 2016-01-02
214 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
215 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
216 --action load_bibs --wait 2
222 sub report_progress {
223 my ($msg, $counter) = @_;
224 if (defined $counter) {
225 print STDERR "$msg: $counter\n";
227 print STDERR "$msg\n";
232 my ($db, $dbuser, $dbpw, $dbhost) = @_;
234 my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432";
237 ShowErrorStatement => 1,
242 my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs);
247 sub handle_stage_bibs {
253 DROP TABLE IF EXISTS $schema.$batch;
256 CREATE TABLE $schema.$batch (
260 imported BOOLEAN DEFAULT FALSE,
261 to_import BOOLEAN DEFAULT TRUE,
268 binmode STDIN, ':utf8';
269 my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)");
274 report_progress("Records staged", $i);
278 my $marc = MARC::Record->new_from_usmarc($_);
279 my $bibid = $marc->subfield('901', 'c');
280 if ($bibid !~ /^\d+$/) {
281 print STDERR "Record $i is suspect; skipping\n";
284 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
285 $ins->execute($xml, $bibid);
288 report_progress("Records staged", $i) if 0 != $i % 100;
290 CREATE INDEX ${batch}_bib_id_idx ON
291 $schema.$batch (bib_id);
294 CREATE INDEX ${batch}_id_idx ON
299 sub handle_filter_bibs {
305 my $sth1 = $dbh->prepare(qq{
306 UPDATE $schema.$batch
307 SET to_import = FALSE,
308 skip_reason = 'deleted'
311 FROM biblio.record_entry
318 my $ct = $sth1->rows;
319 report_progress("Filtering out $ct records that are currently deleted");
321 my $sth2 = $dbh->prepare(qq{
322 UPDATE $schema.$batch
323 SET to_import = FALSE,
324 skip_reason = 'edited after cutoff of $cutoff'
327 FROM biblio.record_entry
333 $sth2->execute($cutoff);
335 report_progress("Filtering out $ct records edited after cutoff date of $cutoff");
337 my $sth3 = $dbh->prepare(qq{
338 UPDATE $schema.$batch
339 SET to_import = FALSE,
340 skip_reason = 'XML is not well-formed'
341 WHERE NOT xml_is_well_formed(marc)
347 report_progress("Filtering out $ct records whose XML is not well-formed");
350 sub handle_load_bibs {
356 my $getct = $dbh->prepare(qq{
363 my $max = $getct->fetchrow_arrayref()->[0];
365 report_progress('Number of bibs to update', $max);
366 for (my $i = 1; $i <= $max; $i++) {
367 report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max;
370 UPDATE biblio.record_entry a
372 FROM $schema.$batch b
373 WHERE a.id = b.bib_id
384 UPDATE $schema.$batch
400 sub handle_stage_auths {
406 DROP TABLE IF EXISTS $schema.auths_$batch;
409 CREATE TABLE $schema.auths_$batch (
414 existing_heading TEXT,
417 cancelled_auth_id BIGINT,
419 lccn_matched BOOLEAN DEFAULT FALSE,
420 heading_matched BOOLEAN DEFAULT FALSE,
421 imported BOOLEAN DEFAULT FALSE,
422 to_import BOOLEAN DEFAULT TRUE,
429 binmode STDIN, ':utf8';
430 my $ins = $dbh->prepare(qq{
431 INSERT INTO $schema.auths_$batch (marc, auth_id, lccn, cancelled_lccn, heading)
432 VALUES (?, ?, ?, ?, authority.normalize_heading(?))
438 report_progress("Records staged", $i);
442 my $marc = MARC::Record->new_from_usmarc($_);
443 my $authid = $marc->subfield('901', 'c');
444 if (defined($authid) && $authid !~ /^\d+$/) {
447 my $lccn = $marc->subfield('010', 'a');
453 my $cancelled_lccn = $marc->subfield('010', 'z');
454 if (defined $cancelled_lccn) {
455 $cancelled_lccn =~ s/^\s+//;
456 $cancelled_lccn =~ s/\s+$//;
457 $cancelled_lccn =~ s/\s+/ /g;
459 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
460 $ins->execute($xml, $authid, $lccn, $cancelled_lccn, $xml);
463 report_progress("Records staged", $i) if 0 != $i % 100;
465 CREATE INDEX auths_${batch}_auth_id_idx ON
466 $schema.auths_$batch (auth_id);
469 CREATE INDEX auths_${batch}_id_idx ON
470 $schema.auths_$batch (id);
473 CREATE INDEX auths_${batch}_lccn_idx ON
474 $schema.auths_$batch (lccn);
478 sub handle_match_auths {
479 my ($dbh, $schema, $batch) = @_;
481 my $sth = $dbh->prepare(qq{
482 UPDATE $schema.auths_${batch} a
483 SET auth_id = b.record,
485 existing_heading = authority.normalize_heading(c.marc)
486 FROM authority.full_rec b
487 JOIN authority.record_entry c ON (b.record = c.id)
492 AND lccn IS NOT NULL;
496 report_progress("Matched $ct authorities on LCCN");
498 $sth = $dbh->prepare(qq{
499 UPDATE $schema.auths_${batch} a
500 SET cancelled_auth_id = b.record
501 FROM authority.full_rec b
504 AND value = cancelled_lccn
506 AND cancelled_lccn IS NOT NULL;
510 report_progress("Matched $ct authorities on cancelled LCCN");
512 $sth = $dbh->prepare(qq{
513 UPDATE $schema.auths_$batch a
515 heading_matched = TRUE,
516 existing_heading = b.heading
517 FROM authority.record_entry b
518 WHERE a.heading = b.heading
523 report_progress("Matched $ct authorities on heading");
526 sub handle_load_new_auths {
531 my $getct = $dbh->prepare(qq{
533 FROM $schema.auths_$batch
536 AND new_auth_id IS NULL
538 AND cancelled_auth_id IS NULL
541 my $max = $getct->fetchrow_arrayref()->[0];
543 report_progress('Number of authorities to add', $max);
544 for (my $i = 1; $i <= $max; $i++) {
545 report_progress('... authorities added', $i) if 0 == $i % 10 or $i == $max;
548 INSERT INTO authority.record_entry (marc, last_xact_id)
549 SELECT marc, ? || '-' || id
550 FROM $schema.auths_$batch b
553 FROM $schema.auths_$batch
556 AND new_auth_id IS NULL
558 AND cancelled_auth_id IS NULL
562 }, {}, "auths_$batch");
564 UPDATE $schema.auths_$batch
566 new_auth_id = CURRVAL('authority.record_entry_id_seq')
569 FROM $schema.auths_$batch
572 AND new_auth_id IS NULL
574 AND cancelled_auth_id IS NULL
583 sub handle_overlay_auths_stage1 {
588 my $getct = $dbh->prepare(qq{
590 FROM $schema.auths_$batch
594 AND heading = existing_heading
597 my $max = $getct->fetchrow_arrayref()->[0];
598 report_progress('Number of auths to update', $max);
601 UPDATE config.internal_flag SET enabled = TRUE
602 WHERE name = 'ingest.disable_authority_auto_update';
604 for (my $i = 1; $i <= $max; $i++) {
605 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
608 UPDATE authority.record_entry a
611 FROM $schema.auths_$batch b
612 WHERE a.id = b.auth_id
615 FROM $schema.auths_$batch
619 AND heading = existing_heading
625 UPDATE $schema.auths_$batch
629 FROM $schema.auths_$batch
633 AND heading = existing_heading
641 UPDATE config.internal_flag SET enabled = FALSE
642 WHERE name = 'ingest.disable_authority_auto_update';
646 sub handle_overlay_auths_stage2 {
651 my $getct = $dbh->prepare(qq{
653 FROM $schema.auths_$batch
657 AND heading <> existing_heading
660 my $max = $getct->fetchrow_arrayref()->[0];
661 report_progress('Number of auths to update', $max);
663 for (my $i = 1; $i <= $max; $i++) {
664 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
667 UPDATE authority.record_entry a
670 FROM $schema.auths_$batch b
671 WHERE a.id = b.auth_id
674 FROM $schema.auths_$batch
678 AND heading <> existing_heading
684 UPDATE $schema.auths_$batch
688 FROM $schema.auths_$batch
692 AND heading <> existing_heading
701 sub handle_overlay_auths_stage3 {
706 my $getct = $dbh->prepare(qq{
708 FROM $schema.auths_$batch
713 auth_id = cancelled_auth_id
715 AND cancelled_auth_id IS NOT NULL
718 my $max = $getct->fetchrow_arrayref()->[0];
719 report_progress('Number of auths to update', $max);
721 for (my $i = 1; $i <= $max; $i++) {
722 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
725 UPDATE authority.record_entry a
728 FROM $schema.auths_$batch b
729 WHERE a.id = b.cancelled_auth_id
730 AND cancelled_auth_id IN (
732 FROM $schema.auths_$batch
737 auth_id = cancelled_auth_id
739 AND cancelled_auth_id IS NOT NULL
745 UPDATE $schema.auths_$batch
747 WHERE cancelled_auth_id IN (
748 SELECT cancelled_auth_id
749 FROM $schema.auths_$batch
754 auth_id = cancelled_auth_id
756 AND cancelled_auth_id IS NOT NULL
765 sub handle_link_auth_auth {
771 UPDATE config.internal_flag SET enabled = TRUE
772 WHERE name = 'ingest.disable_authority_auto_update';
775 my $sth = $dbh->prepare(qq{
776 SELECT COALESCE(new_auth_id, auth_id, cancelled_auth_id) AS id
777 FROM $schema.auths_$batch
782 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
784 report_progress(scalar(@ids) . " records to do auth-auth linking");
785 foreach my $id (@ids) {
787 report_progress('... auth-auth linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
788 system "/openils/bin/authority_authority_linker.pl -r $id -c /openils/conf/opensrf_core.xml";
792 UPDATE config.internal_flag SET enabled = FALSE
793 WHERE name = 'ingest.disable_authority_auto_update';
797 sub handle_link_auth_bib {
802 my $sth = $dbh->prepare(qq{
809 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
811 report_progress(scalar(@ids) . " records to do auth-bib linking");
812 foreach my $id (@ids) {
814 report_progress('... auth-bib linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
815 system "/openils/bin/authority_control_fields.pl --record $id -c /openils/conf/opensrf_core.xml";
820 sub handle_export_skipped_bibs {
827 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
828 binmode $outfh, ':utf8';
830 my $sth = $dbh->prepare(qq{
833 WHERE skip_reason ~ '^edit'
838 while (my $row = $sth->fetchrow_hashref()) {
839 my $marc = MARC::Record->new_from_xml($row->{marc});
840 print $outfh $marc->as_usmarc();
845 sub handle_export_skipped_auths {
852 open($outfh, '>', $output) or die("Could not open input file $output: $!\n");
853 binmode $outfh, ':utf8';
855 my $sth = $dbh->prepare(qq{
857 FROM $schema.auths_$batch
863 while (my $row = $sth->fetchrow_hashref()) {
864 my $marc = MARC::Record->new_from_xml($row->{marc});
865 print $outfh $marc->as_usmarc();