3 # Copyright (C) 2009-2014 Equinox Software, Inc.
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This script provides an example of how to
19 # process a file of MARC bib records and deduplicate
20 # them against bibs already present in an Evergreen
# database. The result in the output directory will contain:
24 # bibs_to_add.xml - new bibs in MARCXML, converted to UTF8,
25 # with record IDs set. This is the set
26 # of bibs that do NOT match incumbent bibs,
27 # and which therefore should be loaded.
28 # old2new.map - two-column mapping from old ILS bib ID
29 # to pre-merge Evergreen bib ID
30 # merge.final - for the bibs that match ones already in
31 # the Evergreen database, a two-column mapping
32 # whose first column is the incumbent bib's
33 # Evergreen ID and whose second column is the
34 # placeholder Evergreen bib ID assigned to the
35 # bib in the input file. Joining old2new.map
36 # and merge.final will produce a mapping from the
# original ILS bib ID to the destination Evergreen bib ID.
40 # Since bibs_to_add.xml contains the Evergreen bib ID to use when
41 # loading the bib record in the 903 field, the --idfield=903 option
42 # should be used when processing that file through marc2bre.pl.
44 # First, some initial settings
BIBFILE=/path/to/bib.mrc   # file of MARC bib records
BIBCHARSET=MARC8           # character encoding of the bib records, typically
                           # either MARC8 or UTF8
BIBIDSTART=1000000         # starting bib ID to assign to new bibs
ORIGIDTAG=999              # MARC tag storing the original ILS's bib ID
ORIGIDSF=a                 # MARC subfield storing the original ILS's bib ID

# connection parameters for the Evergreen database; each one may be
# overridden from the environment before running this script
DBHOST=${DBHOST:-localhost}   # database server host
DBUSER=${DBUSER:-evergreen}   # database user
DBNAME=${DBNAME:-evergreen}   # database name
DBPASS=${DBPASS:-}            # database password; deliberately not stored
                              # here -- supply it via the environment.
                              # NOTE(review): the host/user/name defaults are
                              # placeholders; confirm them for your site.

INTER=scratch              # directory to store intermediate files
LOG=log                    # directory to store log files
OUT=out                    # directory to store output files

MIGTOOLS=$HOME/migration-tools   # path to Git checkout of migration-tools

# Make the migration-tools scripts and Perl modules reachable.
export PATH="$PATH:$MIGTOOLS:/openils/bin"
# Only prepend the existing PERL5LIB when it is non-empty, so the value
# never starts with an empty path entry.
export PERL5LIB="${PERL5LIB:+$PERL5LIB:}$MIGTOOLS/Equinox-Migration/lib"
# This function converts the source bib file to MARCXML,
# runs it through a cleanup process, and emits a mapping file
# from the source ILS bib ID to the (as-yet-undedupped) new Evergreen
# bib ID.
#
# Globals read: BIBFILE, BIBCHARSET, BIBIDSTART, ORIGIDTAG, ORIGIDSF,
#               INTER, LOG, OUT
# Files written: $INTER/bibs_pass1.xml, $INTER/bib.clean.xml,
#                $LOG/bib.precleanup.errors.xml, $OUT/old2new.map
# Exits the script with status 1 if $BIBFILE is not readable.
function prepare_bibs {
    # Nothing below can work without the input file, so stop early.
    if [ ! -r "$BIBFILE" ]; then
        echo "ERROR: Could not read bib file $BIBFILE" >&2
        exit 1
    fi

    echo Running yaz-marcdump:
    # Convert from the source encoding to UTF-8 MARCXML.
    # NOTE(review): "-l 9=97" presumably sets leader position 9 to 'a'
    # (ASCII 97) to mark the records as Unicode -- confirm against the
    # yaz-marcdump manual.
    yaz-marcdump -f "$BIBCHARSET" -t UTF-8 -l 9=97 -o marcxml "$BIBFILE" \
        > "$INTER/bibs_pass1.xml"
    echo yaz-marcdump is done

    echo Running marc_cleanup:
    # Clean the records, renumber them starting at BIBIDSTART, and take
    # the original ILS bib ID from tag ORIGIDTAG, subfield ORIGIDSF.
    marc_cleanup --marcfile="$INTER/bibs_pass1.xml" --fullauto \
        -o "$INTER/bib.clean.xml" -x "$LOG/bib.precleanup.errors.xml" \
        --renumber-from "$BIBIDSTART" \
        -ot "$ORIGIDTAG" -os "$ORIGIDSF"
    echo marc_cleanup is done

    # old2new.map is the source ILS bib ID to new Evergreen bib ID map
    # NOTE(review): nothing in this function names old2new.map as an
    # output; presumably marc_cleanup creates it -- confirm that it is
    # written into $INTER and not the current directory.
    cp "$INTER/old2new.map" "$OUT/old2new.map"
}
# This function calculates "fingerprints" for all of the
# bibs in the input file as well as the bibs in the Evergreen
# database, then uses those fingerprints to identify duplicate
# records. The result is a file of bibs in MARCXML format that
# should be loaded into the database.
#
# Globals read: BIBIDSTART, DBHOST, DBUSER, DBPASS, DBNAME, INTER, OUT
# Input file:    $INTER/bib.clean.xml
# Files written: $OUT/merge.final, $OUT/bibs_to_add.xml, plus
#                intermediate files under $INTER
function calculate_duplicates {
    local i

    # Incumbent bibs are the non-deleted records whose ID is below
    # BIBIDSTART; each is emitted as "id<TAB>marc" with newlines
    # stripped from the MARC so that every record is one line.
    # printf '%s' keeps the backslash in E'\n' literal.
    printf '%s\n' "select id || chr(9) || REGEXP_REPLACE(marc, E'\\n','','g') from biblio.record_entry where not deleted and id < $BIBIDSTART" \
        > "$INTER/incumbent_bibs.sql"

    echo Extracting incumbent bibs:
    # Read the query back from the same file it was just written to.
    PGPASSWORD="$DBPASS" psql -h "$DBHOST" -A -t -U "$DBUSER" "$DBNAME" \
        < "$INTER/incumbent_bibs.sql" \
        | munge_marc_export_for_fingerprint.pl > "$INTER/incumbent.mrc"

    echo fingerprinter on incumbent bibs:
    fingerprinter --fingerprints oclc,isbn,edition,issn,lccn,accomp,authpub \
        -o "$INTER/incumbent.fp" -x "$INTER/incumbent.fp.ex" \
        "$INTER/incumbent.mrc"

    echo fingerprinter on new bibs:
    # The exceptions file goes under $INTER, next to the incumbent one.
    fingerprinter --fingerprints oclc,isbn,edition,issn,lccn,accomp,authpub \
        -o "$INTER/new.fp" -x "$INTER/new.fp.ex" "$INTER/bib.clean.xml"

    echo Merging fingerprints:
    cat "$INTER/incumbent.fp" "$INTER/new.fp" | sort -r > "$INTER/dedupe.fp"
    match_fingerprints -t "$BIBIDSTART" -o "$INTER/merge" "$INTER/dedupe.fp"

    # Match again on each fingerprint type taken separately.
    for i in isbn authpub lccn oclc issn edition; do
        grep "$i" "$INTER/dedupe.fp" > "$INTER/$i.fp"
        match_fingerprints -t "$BIBIDSTART" -o "$INTER/merge-$i" "$INTER/$i.fp"
    done

    echo ...combining all of the above
    # NOTE(review): merge-issn is produced by the loop above but is not
    # part of this list. Confirm whether leaving it out is intentional
    # before adding it, since that would change which bibs get merged.
    cat "$INTER/merge" "$INTER/merge-isbn" "$INTER/merge-authpub" \
        "$INTER/merge-lccn" "$INTER/merge-oclc" "$INTER/merge-edition" \
        | sort | uniq > "$INTER/merge-combined"
    cleanup_merge_map.pl "$INTER/merge-combined" > "$OUT/merge.final"

    echo "Dedupe merge map: $OUT/merge.final"

    echo extract_loadset:
    extract_loadset -l 1 -i "$INTER/bib.clean.xml" -o "$OUT/bibs_to_add.xml" \
        "$OUT/merge.final"

    echo Done with fingerprinting.
}
149 # actually run the process