1 -- SCLENDS bibliographic dedupe routine
3 -- Copyright 2010-2011 Equinox Software, Inc.
4 -- Author: Galen Charlton <gmc@esilibrary.com>
6 -- This program is free software; you can redistribute it and/or modify
7 -- it under the terms of the GNU General Public License as published by
8 -- the Free Software Foundation; either version 2, or (at your option)
11 -- This program is distributed in the hope that it will be useful,
12 -- but WITHOUT ANY WARRANTY; without even the implied warranty of
13 -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 -- GNU General Public License for more details.
16 -- You should have received a copy of the GNU General Public License
17 -- along with this program. If not, see <http://www.gnu.org/licenses/>.
20 -- This implements a bibliographic deduplication routine based
21 -- on criteria and an algorithm specified by the South Carolina
22 -- State Library on behalf of the SC LENDS consortium. This work
23 -- was sponsored by SC LENDS, whose impetus is gratefully
24 -- acknowledged. Portions of this script were subsequently expanded
25 -- based on the advice of the Indiana State Library on the behalf
26 -- of the Evergreen Indiana project.
28 -- schema to store the dedupe routine and intermediate data
-- Dedicated schema for the dedupe routine and its intermediate tables.
CREATE SCHEMA m_dedupe;

-- Row shape returned by the match-key extraction function:
-- one row per normalized ISBN found on a bib record.
CREATE TYPE mig_isbn_match AS (
    norm_isbn  TEXT,
    norm_title TEXT,
    qual       TEXT,
    bibid      BIGINT
);
33 -- function to calculate the normalized ISBN and title match keys
34 -- and the bibliographic portion of the quality score. The normalized
35 -- ISBN key consists of the set of 020$a and 020$z normalized as follows:
36 -- * numeric portion of the ISBN converted to ISBN-13 format
38 -- The normalized title key is taken from the 245$a with the nonfiling
39 -- characters and leading and trailing whitespace removed, ampersands
40 -- converted to ' and ', other punctuation removed, and the text converted
43 -- The quality score is a 19-digit integer computed by concatenating
44 -- counts of various attributes in the MARC records; see the get_quality
45 -- routine for details.
-- NOTE(review): this excerpt preserves the file's own line numbers and is
-- missing several source lines (helper sub headers, eval/closing braces),
-- so the code below is left byte-identical; only comments are added.
47 CREATE OR REPLACE FUNCTION m_dedupe.get_isbn_match_key (bib_id BIGINT, marc TEXT) RETURNS SETOF mig_isbn_match AS $func$
# Emits one row per distinct normalized ISBN (020$a and 020$z) on the bib,
# each carrying the normalized 245$a title key and the record quality score.
52 use MARC::File::XML (BinaryEncoding => 'utf8');
# Set encoding discipline on the standard handles before parsing.
# NOTE(review): STDERR is bound twice (':bytes' then ':utf8'); the second
# call layers over the first -- presumably ':bytes' is vestigial. Confirm.
55 binmode(STDERR, ':bytes');
56 binmode(STDOUT, ':utf8');
57 binmode(STDERR, ':utf8');
# Quality-score builder: '1'/'0' for presence of an 003 control field,
# followed by fixed-width two-digit counts/lengths of the fields below,
# concatenated into one numeric string (lexically higher = "better").
59 my $get_quality = sub {
62 my $has003 = (scalar($marc->field('003'))) ? '1' : '0';
64 return join('', $has003,
65 count_field($marc, '02.'),
66 count_field($marc, '24.'),
67 field_length($marc, '300'),
68 field_length($marc, '100'),
69 count_field($marc, '010'),
70 count_field($marc, '50.', '51.', '52.', '53.', '54.', '55.', '56.', '57.', '58.'),
71 count_field($marc, '6..'),
72 count_field($marc, '440', '490', '830'),
73 count_field($marc, '7..'),
# Main body: unpack the (bibid, marcxml) arguments passed by PL/Perl.
77 my ($bibid, $xml) = @_;
# Force Leader/09 (character coding scheme) to 'a' (Unicode/UTF-8) so
# MARC::File::XML parses the record as UTF-8.
79 $xml =~ s/(<leader>.........)./${1}a/;
82 $marc = MARC::Record->new_from_xml($xml);
# Parse-failure path: logging is commented out; the import call below
# presumably re-arms MARC::File::XML after an error. NOTE(review): the
# surrounding eval/if($@) frame is absent from this excerpt -- confirm.
85 #elog("could not parse $bibid: $@\n");
86 import MARC::File::XML (BinaryEncoding => 'utf8');
# A 245 with a non-empty normalized $a and at least one 020 are required;
# otherwise the bib contributes no match keys at all.
90 my @f245 = $marc->field('245');
91 return unless @f245; # must have 245
92 my $norm_title = norm_title($f245[0]);
93 return unless $norm_title ne '';
95 my @isbns = $marc->field('020');
96 return unless @isbns; # must have at least 020
98 my $qual = $get_quality->($marc);
# One result row per distinct normalized ISBN.
100 my @norm_isbns = norm_isbns(@isbns);
101 foreach my $isbn (@norm_isbns) {
102 return_next({ norm_isbn => $isbn, norm_title => $norm_title, qual => $qual, bibid => $bibid });
# count_field helper (header line absent from this excerpt): total number
# of occurrences of the given tag pattern(s), capped at 99, two digits.
111 foreach my $tag (@tags) {
112 my @f = $marc->field($tag);
113 $total += scalar(@f);
115 $total = 99 if $total > 99;
116 return sprintf("%-02.2d", $total);
# field_length helper: stringified length of the first occurrence of the
# tag, '00' when the field is absent, capped at 99, two digits.
123 my @f = $marc->field($tag);
124 return '00' unless @f;
125 my $len = length($f[0]->as_string);
126 $len = 99 if $len > 99;
127 return sprintf("%-02.2d", $len);
# norm_title helper: 245$a with nonfiling characters (per indicator 2)
# removed, '[large print]' stripped, punctuation removed. NOTE(review):
# further normalization lines are absent from this excerpt.
132 my $sfa = $f245->subfield('a');
133 return '' unless defined $sfa;
134 my $nonf = $f245->indicator(2);
135 $nonf = '0' unless $nonf =~ /^\d$/;
141 $sfa = substr($sfa, $nonf);
145 $sfa =~ s/\[large print\]//;
146 $sfa =~ s/[[:punct:]]//g;
# norm_isbns helper: collect the distinct normalized values of every
# 020$a and 020$z, dropping any that normalize to the empty string.
157 foreach my $field (@isbns) {
158 my $sfa = $field->subfield('a');
159 my $norm = norm_isbn($sfa);
160 $uniq_isbns{$norm}++ unless $norm eq '';
161 my $sfz = $field->subfield('z');
162 $norm = norm_isbn($sfz);
163 $uniq_isbns{$norm}++ unless $norm eq '';
165 return sort(keys %uniq_isbns);
# norm_isbn helper: '' for undef input; a 13-character match is handled by
# the first branch (its body is absent from this excerpt); a 10-character
# ISBN is upconverted to ISBN-13 via Business::ISBN and lowercased.
171 return '' unless defined $str;
178 if ($str =~ /^(\d{12}[0-9-x])/) {
181 } elsif ($str =~ /^(\d{9}[0-9x])/) {
182 $isbn = Business::ISBN->new($1);
183 my $isbn13 = $isbn->as_isbn13;
184 $norm = lc($isbn13->as_string);
189 $func$ LANGUAGE PLPERLU;
191 -- Specify set of bibs to dedupe. This version
192 -- simply collects the IDs of all non-deleted bibs,
193 -- but the query could be expanded to exclude bibliographic
194 -- records that should not participate in the deduplication.
-- NOTE(review): the SELECT list and WHERE clause (original lines 196 and
-- 198-199) are absent from this excerpt; presumably `SELECT bre.id AS
-- bib_id` and `WHERE NOT bre.deleted` -- confirm against the upstream script.
195 CREATE TABLE m_dedupe.bibs_to_check AS
197 FROM biblio.record_entry bre
200 -- staging table for the match keys
-- NOTE(review): the column definitions (original lines 202-207) are absent
-- from this excerpt; the INSERT below supplies norm_isbn, norm_title,
-- qual, bibid in that order.
201 CREATE TABLE m_dedupe.match_keys (
208 -- calculate match keys
-- Expand the SETOF composite returned by get_isbn_match_key into its four
-- columns. NOTE(review): the derived-table frame around the inner SELECT
-- (original lines 214 and 218, supplying the alias "a") is absent here.
209 INSERT INTO m_dedupe.match_keys
210 SELECT (a.get_isbn_match_key::mig_isbn_match).norm_isbn,
211 (a.get_isbn_match_key::mig_isbn_match).norm_title,
212 (a.get_isbn_match_key::mig_isbn_match).qual,
213 (a.get_isbn_match_key::mig_isbn_match).bibid
215 SELECT m_dedupe.get_isbn_match_key(bre.id, bre.marc)
216 FROM biblio.record_entry bre
217 JOIN m_dedupe.bibs_to_check c ON (c.bib_id = bre.id)
-- Indexes backing the match-key joins and quality lookups below.
CREATE INDEX norm_idx ON m_dedupe.match_keys (norm_isbn, norm_title);
CREATE INDEX qual_idx ON m_dedupe.match_keys (qual);
-- Remove duplicate match keys: stage the distinct rows in a temporary
-- table, empty match_keys, and reload it from the de-duplicated copy.
CREATE TEMPORARY TABLE uniq_match_keys AS
SELECT DISTINCT norm_isbn, norm_title, qual, bibid
FROM m_dedupe.match_keys;

-- Intentional whole-table DELETE: the table is reloaded immediately below.
DELETE FROM m_dedupe.match_keys;

-- Explicit select list instead of SELECT * so the reload cannot silently
-- reorder columns or pick up columns added to the staging table later.
INSERT INTO m_dedupe.match_keys
SELECT norm_isbn, norm_title, qual, bibid
FROM uniq_match_keys;
231 -- find highest-quality match keys
-- For each (isbn, title) match key, record the best quality score seen.
-- NOTE(review): the statement terminator and any trailing clause (original
-- lines 236-237) are absent from this excerpt -- possibly a HAVING that
-- restricts to keys shared by more than one bib; confirm upstream.
232 CREATE TABLE m_dedupe.lead_quals AS
233 SELECT max(qual) as max_qual, norm_isbn, norm_title
234 FROM m_dedupe.match_keys
235 GROUP BY norm_isbn, norm_title
-- Lookup indexes for the lead-quality joins below.
-- NOTE(review): norm_idx2 is a prefix of norm_qual_idx2 and is likely
-- redundant; both are kept to preserve behavior.
CREATE INDEX norm_idx2 ON m_dedupe.lead_quals (norm_isbn, norm_title);
CREATE INDEX norm_qual_idx2 ON m_dedupe.lead_quals (norm_isbn, norm_title, max_qual);
241 -- identify prospective lead bibs
-- For each best (isbn, title, quality) key, find the bibs holding it and
-- count their copies on non-deleted call numbers (used to break ties).
-- NOTE(review): original line 249 is absent from this excerpt --
-- presumably `and not ac.deleted`; confirm against the upstream script.
242 CREATE TABLE m_dedupe.prospective_leads AS
243 SELECT bibid, a.norm_isbn, a.norm_title, b.max_qual, count(ac.id) as copy_count
244 FROM m_dedupe.match_keys a
245 JOIN m_dedupe.lead_quals b on (a.qual = b.max_qual and a.norm_isbn = b.norm_isbn and a.norm_title = b.norm_title)
246 JOIN asset.call_number acn on (acn.record = bibid)
247 JOIN asset.copy ac on (ac.call_number = acn.id)
248 WHERE not acn.deleted
250 GROUP BY bibid, a.norm_isbn, a.norm_title, b.max_qual;
-- Tie-break by holdings: for each (isbn, title, quality) key, keep the
-- highest copy count seen among the prospective lead bibs.
CREATE TABLE m_dedupe.best_lead_keys AS
SELECT
    norm_isbn,
    norm_title,
    max_qual,
    MAX(copy_count) AS copy_count
FROM m_dedupe.prospective_leads
GROUP BY norm_isbn, norm_title, max_qual;
-- Resolve each winning key back to concrete bibs: join the winning
-- (key, quality, copy_count) combination to the prospective leads holding it.
CREATE TABLE m_dedupe.best_leads AS
SELECT
    bibid,
    blk.norm_isbn,
    blk.norm_title,
    blk.max_qual,
    copy_count
FROM m_dedupe.best_lead_keys blk
INNER JOIN m_dedupe.prospective_leads pl
    USING (norm_isbn, norm_title, max_qual, copy_count);
-- Break any remaining ties deterministically: the lowest bib ID wins
-- among the surviving candidates for each match key.
CREATE TABLE m_dedupe.unique_leads AS
SELECT
    MIN(bibid) AS lead_bibid,
    norm_isbn,
    norm_title,
    max_qual
FROM m_dedupe.best_leads
GROUP BY norm_isbn, norm_title, max_qual;
-- First cut of the merge map: every bib that shares a match key with a
-- lead becomes a subordinate of that lead (self-pairings excluded).
CREATE TABLE m_dedupe.merge_map_pre AS
SELECT DISTINCT
    lead_bibid,
    bibid AS sub_bibid
FROM m_dedupe.unique_leads
INNER JOIN m_dedupe.match_keys
    USING (norm_isbn, norm_title)
WHERE lead_bibid <> bibid;
276 -- and resolve transitive maps
-- Repoint any lead that is itself mapped as a subordinate elsewhere.
-- The identical statement is deliberately run three times: each pass
-- collapses one level of lead->sub chaining, so chains up to that depth
-- are resolved. NOTE(review): deeper chains (or cycles) would survive
-- these passes; presumably the min()-grouped merge_map build downstream
-- tolerates that -- confirm.
277 UPDATE m_dedupe.merge_map_pre a
278 SET lead_bibid = b.lead_bibid
279 FROM m_dedupe.merge_map_pre b
280 WHERE a.lead_bibid = b.sub_bibid;
-- Second pass (collapses one more level of chaining).
282 UPDATE m_dedupe.merge_map_pre a
283 SET lead_bibid = b.lead_bibid
284 FROM m_dedupe.merge_map_pre b
285 WHERE a.lead_bibid = b.sub_bibid;
-- Third pass (collapses one more level of chaining).
287 UPDATE m_dedupe.merge_map_pre a
288 SET lead_bibid = b.lead_bibid
289 FROM m_dedupe.merge_map_pre b
290 WHERE a.lead_bibid = b.sub_bibid;
292 -- and produce the final merge map
-- Collapse any remaining multi-lead rows to a single lead per subordinate.
-- NOTE(review): the GROUP BY / terminator (original lines 296-297) are
-- absent from this excerpt; min(lead_bibid) implies GROUP BY sub_bibid.
293 CREATE TABLE m_dedupe.merge_map
294 AS SELECT min(lead_bibid) as lead_bibid, sub_bibid
295 FROM m_dedupe.merge_map_pre
298 -- add a unique ID to the merge map so that
299 -- we can do the actual record merging in chunks
300 ALTER TABLE m_dedupe.merge_map ADD COLUMN id serial, ADD COLUMN done BOOLEAN DEFAULT FALSE;
302 -- and here's an example of processing a chunk of a 1000
-- NOTE(review): several lines of this example (original 303, 307-311,
-- 313, 315-316) are absent from this excerpt -- presumably the inner
-- subselects filter on NOT done, order by id, and LIMIT 1000, and the
-- UPDATE below marks the same chunk as done. Confirm before running.
304 SELECT asset.merge_record_assets(lead_bibid, sub_bibid)
305 FROM m_dedupe.merge_map WHERE id in (
306 SELECT id FROM m_dedupe.merge_map
-- Mark the just-processed chunk as done so the next run picks up fresh rows.
312 UPDATE m_dedupe.merge_map set done = true
314 SELECT id FROM m_dedupe.merge_map