From: Galen Charlton
Date: Thu, 29 Jan 2015 22:51:08 +0000 (+0000)
Subject: example script showing how to use 'fingerprinter' to deduplicate bib records
X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=e36038ee87bb072712e2d36321d3389e1ca52118

example script showing how to use 'fingerprinter' to deduplicate bib records

Signed-off-by: Galen Charlton
---

diff --git a/examples/bib_dedupe_and_load.sh b/examples/bib_dedupe_and_load.sh
new file mode 100755
index 0000000..c3078b1
--- /dev/null
+++ b/examples/bib_dedupe_and_load.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+
+# Copyright (C) 2009-2014 Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# This script provides an example of how to
+# process a file of MARC bib records and deduplicate
+# them against bibs already present in an Evergreen
+# database. The result in the output directory will
+# be three files:
+#
+# bibs_to_add.xml - new bibs in MARCXML, converted to UTF8,
+#                   with record IDs set. This is the set
+#                   of bibs that do NOT match incumbent bibs,
+#                   and which therefore should be loaded.
+# old2new.map - two-column mapping from old ILS bib ID
+#               to pre-merge Evergreen bib ID
+# merge.final - for the bibs that match ones already in
+#               the Evergreen database, a two-column mapping
+#               whose first column is the incumbent bib's
+#               Evergreen ID and whose second column is the
+#               placeholder Evergreen bib ID assigned to the
+#               bib in the input file. Joining old2new.map
+#               and merge.final will produce a mapping from the
+#               original ILS bib ID to the destination Evergreen
+#               bib ID.
+#
+# Since bibs_to_add.xml contains the Evergreen bib ID to use when
+# loading the bib record in the 903 field, the --idfield=903 option
+# should be used when processing that file through marc2bre.pl.
+
+# First, some initial settings
+
+BIBFILE=/path/to/bib.mrc   # file of MARC bib records
+BIBCHARSET=MARC8           # character encoding of the bib records, typically
+                           # either MARC8 or UTF8
+BIBIDSTART=1000000         # starting bib ID to assign to new bibs
+ORIGIDTAG=999              # MARC tag storing the original ILS's bib ID
+ORIGIDSF=a                 # MARC subfield storing the original ILS's bib ID
+
+# connection parameters for the Evergreen database
+DBHOST=localhost
+DBPORT=5432
+DBNAME=evergreen
+DBUSER=evergreen
+DBPASS=evergreen
+
+INTER=scratch              # directory to store intermediate files
+LOG=log                    # directory to store log files
+OUT=out                    # directory to store output files
+
+MIGTOOLS=$HOME/migration-tools # path to Git checkout of migration-tools
+
+export PATH=$PATH:$MIGTOOLS:/openils/bin
+export PERL5LIB=$PERL5LIB:$MIGTOOLS/Equinox-Migration/lib
+
+# This function converts the source bib file to MARCXML,
+# runs it through a cleanup process, and emits a mapping file
+# from the source ILS bib ID to the (as-yet-undedupped) new Evergreen
+# bib ID.
+function prepare_bibs {
+
+    if [ ! -r $BIBFILE ]
+    then
+        echo ERROR: Could not read bib file $BIBFILE
+        exit 1
+    fi
+
+    echo Running yaz-marcdump:
+    yaz-marcdump -f $BIBCHARSET -t UTF-8 -l 9=97 -o marcxml $BIBFILE > $INTER/bibs_pass1.xml
+    echo yaz-marcdump is done
+
+    echo Running marc_cleanup:
+    pushd $INTER
+    marc_cleanup --marcfile=$INTER/bibs_pass1.xml --fullauto \
+        -o $INTER/bib.clean.xml -x $LOG/bib.precleanup.errors.xml --renumber-from $BIBIDSTART \
+        -ot $ORIGIDTAG -os $ORIGIDSF
+    popd
+    echo marc_cleanup is done
+
+    # old2new.map is the source ILS bib ID to new Evergreen bib ID map
+    cp $INTER/old2new.map $OUT/old2new.map
+
+}
+
+# This function calculates "fingerprints" for all of the
+# bibs in the input file as well as the bibs in the Evergreen
+# database, then uses those fingerprints to identify duplicate
+# records. The result is a file of bibs in MARCXML format that
+# should be loaded into the database.
+function calculate_duplicates {
+
+    echo "select id || chr(9) || REGEXP_REPLACE(marc, E'\\n','','g') from biblio.record_entry where not deleted and id < $BIBIDSTART" > $INTER/incumbent_bibs.sql
+
+    echo Extracting incumbent bibs:
+    PGPASSWORD=$DBPASS psql -h $DBHOST -p $DBPORT -A -t -U $DBUSER $DBNAME < $INTER/incumbent_bibs.sql | munge_marc_export_for_fingerprint.pl > $INTER/incumbent.mrc
+
+    date
+    echo fingerprinter on incumbent bibs:
+    fingerprinter --fingerprints oclc,isbn,edition,issn,lccn,accomp,authpub \
+        -o $INTER/incumbent.fp -x $INTER/incumbent.fp.ex $INTER/incumbent.mrc
+
+    date
+    echo fingerprinter on new bibs:
+    fingerprinter --fingerprints oclc,isbn,edition,issn,lccn,accomp,authpub \
+        -o $INTER/new.fp -x $INTER/new.fp.ex $INTER/bib.clean.xml
+
+    date
+    echo Merging fingerprints:
+    echo ...all
+    cat $INTER/incumbent.fp $INTER/new.fp | sort -r > $INTER/dedupe.fp
+    match_fingerprints -t $BIBIDSTART -o $INTER/merge $INTER/dedupe.fp
+
+    for i in isbn authpub lccn oclc issn edition
+    do
+        echo ...$i
+        grep $i $INTER/dedupe.fp > $INTER/$i.fp
+        match_fingerprints -t $BIBIDSTART -o $INTER/merge-$i $INTER/$i.fp
+    done
+
+    echo ...combining all of the above
+    cat $INTER/merge $INTER/merge-isbn $INTER/merge-authpub \
+        $INTER/merge-lccn $INTER/merge-oclc $INTER/merge-issn $INTER/merge-edition | \
+        sort | uniq > $INTER/merge-combined
+    cleanup_merge_map.pl $INTER/merge-combined > $OUT/merge.final
+
+    echo Dedupe merge map: $OUT/merge.final
+
+    echo extract_loadset:
+    extract_loadset -l 1 -i $INTER/bib.clean.xml -o $OUT/bibs_to_add.xml $OUT/merge.final
+
+    echo Done with fingerprinting.
+    date
+
+}
+
+# actually run the process
+prepare_bibs
+calculate_duplicates
+