3 # Copyright (c) 2016 Equinox Software, Inc.
4 # Author: Galen Charlton <gmc@esilibrary.com>
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2, or (at your option)
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>
24 use MARC::File::XML (BinaryEncoding => 'utf8');
26 use OpenILS::Application::AppUtils;
29 my $schema = 'bib_loads';
39 'action:s' => \$action,
40 'schema:s' => \$schema,
42 'dbuser:s' => \$dbuser,
43 'dbhost:s' => \$dbhost,
46 'cutoff:s' => \$cutoff,
50 abort('must specify --action') unless defined $action;
51 abort('must specify --schema') unless defined $schema;
52 abort('must specify --db') unless defined $db;
53 abort('must specify --dbuser') unless defined $dbuser;
54 abort('must specify --dbhost') unless defined $dbhost;
55 abort('must specify --dbpw') unless defined $dbpw;
56 abort('must specify --batch') unless defined $batch;
58 abort(q{--action must be "stage_bibs", "filter_bibs", "load_bibs", "stage_auths",
59 "match_auths", "load_new_auths", "overlay_auths_stage1",
60 "overlay_auths_stage2", "overlay_auths_stage3", "link_auth_auth"}) unless
61 $action eq 'filter_bibs' or
62 $action eq 'stage_bibs' or
63 $action eq 'load_bibs' or
64 $action eq 'stage_auths' or
65 $action eq 'match_auths' or
66 $action eq 'load_new_auths' or
67 $action eq 'overlay_auths_stage1' or
68 $action eq 'overlay_auths_stage2' or
69 $action eq 'overlay_auths_stage3' or
70 $action eq 'link_auth_auth'
73 my $dbh = connect_db($db, $dbuser, $dbpw, $dbhost);
75 if ($action eq 'stage_bibs') {
76 abort('must specify at least one input file') unless @ARGV;
77 handle_stage_bibs($dbh, $schema, $batch);
80 if ($action eq 'filter_bibs') {
81 abort('must specify cutoff date when filtering') unless defined $cutoff;
82 handle_filter_bibs($dbh, $schema, $batch, $cutoff);
85 if ($action eq 'load_bibs' ) {
86 handle_load_bibs($dbh, $schema, $batch, $wait);
89 if ($action eq 'stage_auths') {
90 abort('must specify at least one input file') unless @ARGV;
91 handle_stage_auths($dbh, $schema, $batch);
94 if ($action eq 'match_auths') {
95 handle_match_auths($dbh, $schema, $batch);
98 if ($action eq 'load_new_auths') {
99 handle_load_new_auths($dbh, $schema, $batch);
102 if ($action eq 'overlay_auths_stage1') {
103 handle_overlay_auths_stage1($dbh, $schema, $batch);
105 if ($action eq 'overlay_auths_stage2') {
106 handle_overlay_auths_stage2($dbh, $schema, $batch);
108 if ($action eq 'overlay_auths_stage3') {
109 handle_overlay_auths_stage3($dbh, $schema, $batch);
112 if ($action eq 'link_auth_auth') {
113 handle_link_auth_auth($dbh, $schema, $batch);
118 print STDERR "$0: $msg", "\n";
126 Utility to stage and overlay bib records in an Evergreen database. This
127 expects that the incoming records will have been previously exported
128 from that Evergreen database and modified in some fashion (e.g., for
129 authority record processing) and that the bib ID can be found in the
132 This program has several modes controlled by the --action switch:
134 --action stage_bibs - load MARC bib records into a staging table
135 --action filter_bibs - mark previously staged bibs that should
136 be excluded from a subsequent load, either
137 because the target bib is deleted in Evergreen
138 or the record was modified after a date
139 specified by the --cutoff switch
140 --action load_bibs - overlay bib records using a previously staged
141 batch, one at a time. After each bib, it will
142 wait the number of seconds specified by the
145 --action stage_auths - load MARC authorities into staging
147 --action match_auths - identify matches with authority
148 records already present in the
149 Evergreen database; matching is
150 based on LCCN, cancelled LCCN, and
152 --action load_new_auths - load new (unmatched) authorities
153 --action overlay_auths_stage1 - overlay based on LCCN where
154 heading has not change; this step
155 disables propagation to bib records
156 --action overlay_auths_stage2 - overlay based on LCCN where heading
157 has NOT changed; propagates changes
159 --action overlay_auths_stage3 - overlay for records where a cancelled
160 LCCN is replaced with a new one
161 --action link_auth_auth - run authority_authority_linker.pl for
162 the authorities that were overlaid
163 or added in this batch.
165 Several switches are used regardless of the specified action:
167 --schema - Pg schema in which staging table will live; should be
169 --batch - name of bib batch; will also be used as the name
170 of the staging tables
172 --dbuser - database user
173 --dbpw - database password
174 --dbhost - database host
178 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
179 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
180 --action stage_bibs -- file1.mrc file2.mrc [...]
182 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
183 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
184 --action filter_bibs --cutoff 2016-01-02
186 $0 --schema bib_load --batch bibs_2016_01 --db evergreen \\
187 --dbuser evergreen --dbpw evergreen --dbhost localhost \\
188 --action load_bibs --wait 2
194 sub report_progress {
195 my ($msg, $counter) = @_;
196 if (defined $counter) {
197 print STDERR "$msg: $counter\n";
199 print STDERR "$msg\n";
204 my ($db, $dbuser, $dbpw, $dbhost) = @_;
206 my $dsn = "dbi:Pg:host=$dbhost;dbname=$db;port=5432";
209 ShowErrorStatement => 1,
214 my $dbh = DBI->connect($dsn, $dbuser, $dbpw, $attrs);
219 sub handle_stage_bibs {
225 DROP TABLE IF EXISTS $schema.$batch;
228 CREATE TABLE $schema.$batch (
232 imported BOOLEAN DEFAULT FALSE,
233 to_import BOOLEAN DEFAULT TRUE,
240 binmode STDIN, ':utf8';
241 my $ins = $dbh->prepare("INSERT INTO $schema.$batch (marc, bib_id) VALUES (?, ?)");
246 report_progress("Records staged", $i);
250 my $marc = MARC::Record->new_from_usmarc($_);
251 my $bibid = $marc->subfield('901', 'c');
252 if ($bibid !~ /^\d+$/) {
253 print STDERR "Record $i is suspect; skipping\n";
256 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
257 $ins->execute($xml, $bibid);
260 report_progress("Records staged", $i) if 0 != $i % 100;
262 CREATE INDEX ${batch}_bib_id_idx ON
263 $schema.$batch (bib_id);
266 CREATE INDEX ${batch}_id_idx ON
271 sub handle_filter_bibs {
277 my $sth1 = $dbh->prepare(qq{
278 UPDATE $schema.$batch
279 SET to_import = FALSE,
280 skip_reason = 'deleted'
283 FROM biblio.record_entry
290 my $ct = $sth1->rows;
291 report_progress("Filtering out $ct records that are currently deleted");
293 my $sth2 = $dbh->prepare(qq{
294 UPDATE $schema.$batch
295 SET to_import = FALSE,
296 skip_reason = 'edited after cutoff of $cutoff'
299 FROM biblio.record_entry
305 $sth2->execute($cutoff);
307 report_progress("Filtering out $ct records edited after cutoff date of $cutoff");
309 my $sth3 = $dbh->prepare(qq{
310 UPDATE $schema.$batch
311 SET to_import = FALSE,
312 skip_reason = 'XML is not well-formed'
313 WHERE NOT xml_is_well_formed(marc)
319 report_progress("Filtering out $ct records whose XML is not well-formed");
322 sub handle_load_bibs {
328 my $getct = $dbh->prepare(qq{
335 my $max = $getct->fetchrow_arrayref()->[0];
337 report_progress('Number of bibs to update', $max);
338 for (my $i = 1; $i <= $max; $i++) {
339 report_progress('... bibs updated', $i) if 0 == $i % 10 or $i == $max;
342 UPDATE biblio.record_entry a
344 FROM $schema.$batch b
345 WHERE a.id = b.bib_id
356 UPDATE $schema.$batch
372 sub handle_stage_auths {
378 DROP TABLE IF EXISTS $schema.auths_$batch;
381 CREATE TABLE $schema.auths_$batch (
386 existing_heading TEXT,
389 cancelled_auth_id BIGINT,
391 lccn_matched BOOLEAN DEFAULT FALSE,
392 heading_matched BOOLEAN DEFAULT FALSE,
393 imported BOOLEAN DEFAULT FALSE,
394 to_import BOOLEAN DEFAULT TRUE,
401 binmode STDIN, ':utf8';
402 my $ins = $dbh->prepare(qq{
403 INSERT INTO $schema.auths_$batch (marc, auth_id, lccn, cancelled_lccn, heading)
404 VALUES (?, ?, ?, ?, authority.normalize_heading(?))
410 report_progress("Records staged", $i);
414 my $marc = MARC::Record->new_from_usmarc($_);
415 my $authid = $marc->subfield('901', 'c');
416 if (defined($authid) && $authid !~ /^\d+$/) {
419 my $lccn = $marc->subfield('010', 'a');
425 my $cancelled_lccn = $marc->subfield('010', 'z');
426 if (defined $cancelled_lccn) {
427 $cancelled_lccn =~ s/^\s+//;
428 $cancelled_lccn =~ s/\s+$//;
429 $cancelled_lccn =~ s/\s+/ /g;
431 my $xml = OpenILS::Application::AppUtils->entityize($marc->as_xml_record());
432 $ins->execute($xml, $authid, $lccn, $cancelled_lccn, $xml);
435 report_progress("Records staged", $i) if 0 != $i % 100;
437 CREATE INDEX auths_${batch}_auth_id_idx ON
438 $schema.auths_$batch (auth_id);
441 CREATE INDEX auths_${batch}_id_idx ON
442 $schema.auths_$batch (id);
445 CREATE INDEX auths_${batch}_lccn_idx ON
446 $schema.auths_$batch (lccn);
450 sub handle_match_auths {
451 my ($dbh, $schema, $batch) = @_;
453 my $sth = $dbh->prepare(qq{
454 UPDATE $schema.auths_${batch} a
455 SET auth_id = b.record,
457 existing_heading = authority.normalize_heading(c.marc)
458 FROM authority.full_rec b
459 JOIN authority.record_entry c ON (b.record = c.id)
464 AND lccn IS NOT NULL;
468 report_progress("Matched $ct authorities on LCCN");
470 $sth = $dbh->prepare(qq{
471 UPDATE $schema.auths_${batch} a
472 SET cancelled_auth_id = b.record
473 FROM authority.full_rec b
476 AND value = cancelled_lccn
478 AND cancelled_lccn IS NOT NULL;
482 report_progress("Matched $ct authorities on cancelled LCCN");
484 $sth = $dbh->prepare(qq{
485 UPDATE $schema.auths_$batch a
487 heading_matched = TRUE,
488 existing_heading = b.heading
489 FROM authority.record_entry b
490 WHERE a.heading = b.heading
495 report_progress("Matched $ct authorities on heading");
498 sub handle_load_new_auths {
503 my $getct = $dbh->prepare(qq{
505 FROM $schema.auths_$batch
508 AND new_auth_id IS NULL
510 AND cancelled_auth_id IS NULL
513 my $max = $getct->fetchrow_arrayref()->[0];
515 report_progress('Number of authorities to add', $max);
516 for (my $i = 1; $i <= $max; $i++) {
517 report_progress('... authorities added', $i) if 0 == $i % 10 or $i == $max;
520 INSERT INTO authority.record_entry (marc, last_xact_id)
521 SELECT marc, ? || '-' || id
522 FROM $schema.auths_$batch b
525 FROM $schema.auths_$batch
528 AND new_auth_id IS NULL
530 AND cancelled_auth_id IS NULL
534 }, {}, "auths_$batch");
536 UPDATE $schema.auths_$batch
538 new_auth_id = CURRVAL('authority.record_entry_id_seq')
541 FROM $schema.auths_$batch
544 AND new_auth_id IS NULL
546 AND cancelled_auth_id IS NULL
555 sub handle_overlay_auths_stage1 {
560 my $getct = $dbh->prepare(qq{
562 FROM $schema.auths_$batch
566 AND heading = existing_heading
569 my $max = $getct->fetchrow_arrayref()->[0];
570 report_progress('Number of auths to update', $max);
573 UPDATE config.internal_flag SET enabled = TRUE
574 WHERE name = 'ingest.disable_authority_auto_update';
576 for (my $i = 1; $i <= $max; $i++) {
577 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
580 UPDATE authority.record_entry a
583 FROM $schema.auths_$batch b
584 WHERE a.id = b.auth_id
587 FROM $schema.auths_$batch
591 AND heading = existing_heading
597 UPDATE $schema.auths_$batch
601 FROM $schema.auths_$batch
605 AND heading = existing_heading
613 UPDATE config.internal_flag SET enabled = FALSE
614 WHERE name = 'ingest.disable_authority_auto_update';
618 sub handle_overlay_auths_stage2 {
623 my $getct = $dbh->prepare(qq{
625 FROM $schema.auths_$batch
629 AND heading <> existing_heading
632 my $max = $getct->fetchrow_arrayref()->[0];
633 report_progress('Number of auths to update', $max);
635 for (my $i = 1; $i <= $max; $i++) {
636 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
639 UPDATE authority.record_entry a
642 FROM $schema.auths_$batch b
643 WHERE a.id = b.auth_id
646 FROM $schema.auths_$batch
650 AND heading <> existing_heading
656 UPDATE $schema.auths_$batch
660 FROM $schema.auths_$batch
664 AND heading <> existing_heading
673 sub handle_overlay_auths_stage3 {
678 my $getct = $dbh->prepare(qq{
680 FROM $schema.auths_$batch
685 auth_id = cancelled_auth_id
687 AND cancelled_auth_id IS NOT NULL
690 my $max = $getct->fetchrow_arrayref()->[0];
691 report_progress('Number of auths to update', $max);
693 for (my $i = 1; $i <= $max; $i++) {
694 report_progress('... auths updated', $i) if 0 == $i % 10 or $i == $max;
697 UPDATE authority.record_entry a
700 FROM $schema.auths_$batch b
701 WHERE a.id = b.cancelled_auth_id
702 AND cancelled_auth_id IN (
704 FROM $schema.auths_$batch
709 auth_id = cancelled_auth_id
711 AND cancelled_auth_id IS NOT NULL
717 UPDATE $schema.auths_$batch
719 WHERE cancelled_auth_id IN (
720 SELECT cancelled_auth_id
721 FROM $schema.auths_$batch
726 auth_id = cancelled_auth_id
728 AND cancelled_auth_id IS NOT NULL
737 sub handle_link_auth_auth {
743 UPDATE config.internal_flag SET enabled = TRUE
744 WHERE name = 'ingest.disable_authority_auto_update';
747 my $sth = $dbh->prepare(qq{
748 SELECT COALESCE(new_auth_id, auth_id, cancelled_auth_id) AS id
749 FROM $schema.auths_$batch
754 my @ids = map { $_->{id} } @{ $sth->fetchall_arrayref({}) };
756 report_progress(scalar(@ids) . " records to do auth-auth linking");
757 foreach my $id (@ids) {
759 report_progress('... auth-auth linkings processed', $i) if 0 == $i % 10 or $i == scalar(@ids);
760 system "/openils/bin/authority_authority_linker.pl -r $id -c /openils/conf/opensrf_core.xml";
764 UPDATE config.internal_flag SET enabled = FALSE
765 WHERE name = 'ingest.disable_authority_auto_update';