examples/bib_dedupe_and_load.sh

   1 #!/bin/bash
   2
   3 # Copyright (C) 2009-2014 Equinox Software, Inc.
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2, or (at your option)
   8 # any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 # This script provides an example of how to
  19 # process a file of MARC bib records and deduplicate
  20 # them against bibs already present in an Evergreen
  21 # database.  The result in the output directory will
  22 # be three files:
  23 #
  24 #   bibs_to_add.xml - new bibs in MARCXML, converted to UTF8,
  25 #                     with record IDs set.  This is the set
  26 #                     of bibs that do NOT match incumbent bibs,
  27 #                     and which therefore should be loaded.
  28 #   old2new.map     - two-column mapping from old ILS bib ID
  29 #                     to pre-merge Evergreen bib ID
  30 #   merge.final     - for the bibs that match ones already in
  31 #                     the Evergreen database, a two-column mapping
  32 #                     whose first column is the incumbent bib's
  33 #                     Evergreen ID and whose second column is the
  34 #                     placeholder Evergreen bib ID assigned to the
  35 #                     bib in the input file.  Joining old2new.map
  36 #                     and merge.final will produce a mapping from the
  37 #                     original ILS bib ID to the destination Evergreen
  38 #                     bib ID.
  39 #
  # Since each record in bibs_to_add.xml carries, in its 903 field, the
  # Evergreen bib ID to use when loading that record, the --idfield=903
  # option should be used when processing that file through marc2bre.pl.
  43
  44 # First, some initial settings
  45
  # Every setting below can be overridden from the environment, e.g.
  #   BIBFILE=/data/bibs.mrc BIBIDSTART=2000000 ./bib_dedupe_and_load.sh
  # When a variable is unset or empty, the default shown here is used.

  BIBFILE=${BIBFILE:-/path/to/bib.mrc}    # file of MARC bib records
  BIBCHARSET=${BIBCHARSET:-MARC8}         # character encoding of the bib records,
                                          # typically either MARC8 or UTF8
  BIBIDSTART=${BIBIDSTART:-1000000}       # starting bib ID to assign to new bibs
  ORIGIDTAG=${ORIGIDTAG:-999}             # MARC tag storing the original ILS's bib ID
  ORIGIDSF=${ORIGIDSF:-a}                 # MARC subfield storing the original ILS's bib ID

  # connection parameters for the Evergreen database
  DBHOST=${DBHOST:-localhost}
  DBPORT=${DBPORT:-5432}
  DBNAME=${DBNAME:-evergreen}
  DBUSER=${DBUSER:-evergreen}
  DBPASS=${DBPASS:-evergreen}

  INTER=${INTER:-scratch}                 # directory to store intermediate files
  LOG=${LOG:-log}                         # directory to store log files
  OUT=${OUT:-out}                         # directory to store output files

  # path to Git checkout of migration-tools
  MIGTOOLS=${MIGTOOLS:-$HOME/migration-tools}

  export PATH="$PATH:$MIGTOOLS:/openils/bin"
  # ${PERL5LIB:+...} only emits the "existing value plus colon" prefix when
  # PERL5LIB is already set, so no empty leading path element is created.
  export PERL5LIB="${PERL5LIB:+$PERL5LIB:}$MIGTOOLS/Equinox-Migration/lib"
  68
  # This function converts the source bib file to MARCXML,
  # runs it through a cleanup process, and emits a mapping file
  # from the source ILS bib ID to the (as-yet-undeduplicated) new
  # Evergreen bib ID.
  #
  # Globals read: BIBFILE, BIBCHARSET, BIBIDSTART, ORIGIDTAG, ORIGIDSF,
  #               INTER, LOG, OUT
  # Outputs:      $INTER/bibs_pass1.xml, $INTER/bib.clean.xml,
  #               $LOG/bib.precleanup.errors.xml, $OUT/old2new.map
  # Returns:      the status of the final copy of old2new.map; 1 if the
  #               working directories cannot be created or resolved.
  #               Exits the script with status 1 if BIBFILE is unreadable.
  function prepare_bibs {
      local inter_dir log_dir

      if [ ! -r "$BIBFILE" ]
      then
          echo "ERROR: Could not read bib file $BIBFILE" >&2
          exit 1
      fi

      # Make sure the working directories exist, then resolve absolute
      # paths for the two that are used while the working directory is
      # $INTER.  INTER and LOG are relative by default, and a relative
      # "$INTER/..." path no longer points at the right place once the
      # current directory has changed to $INTER.
      mkdir -p -- "$INTER" "$LOG" "$OUT" || return 1
      inter_dir=$(cd -- "$INTER" > /dev/null && pwd) || return 1
      log_dir=$(cd -- "$LOG" > /dev/null && pwd) || return 1

      echo Running yaz-marcdump:
      # -f/-t convert from the source encoding to UTF-8, -o marcxml selects
      # the output format, and -l 9=97 sets leader position 9 to 'a'
      # (character code 97) so the records are flagged as Unicode.
      yaz-marcdump -f "$BIBCHARSET" -t UTF-8 -l 9=97 -o marcxml "$BIBFILE" \
          > "$inter_dir/bibs_pass1.xml"
      echo yaz-marcdump is done

      echo Running marc_cleanup:
      # marc_cleanup is run from inside $INTER; old2new.map is picked up
      # from there afterwards, so presumably the tool writes it to its
      # current directory (confirm against marc_cleanup itself).  The
      # subshell keeps the directory change from leaking to the caller.
      (
          cd -- "$inter_dir" || exit 1
          marc_cleanup --marcfile="$inter_dir/bibs_pass1.xml" --fullauto \
              -o "$inter_dir/bib.clean.xml" \
              -x "$log_dir/bib.precleanup.errors.xml" \
              --renumber-from "$BIBIDSTART" \
              -ot "$ORIGIDTAG" -os "$ORIGIDSF"
      )
      echo marc_cleanup is done

      # old2new.map is the source ILS bib ID to new Evergreen bib ID map
      cp -- "$inter_dir/old2new.map" "$OUT/old2new.map"

  }
  97
  # This function calculates "fingerprints" for all of the
  # bibs in the input file as well as the bibs in the Evergreen
  # database, then uses those fingerprints to identify duplicate
  # records.  The result is a file of bibs in MARCXML format that
  # should be loaded into the database.
  #
  # Globals read: BIBIDSTART, DBHOST, DBPORT, DBNAME, DBUSER, DBPASS,
  #               INTER, OUT
  # Inputs:       $INTER/bib.clean.xml (the cleaned new bibs)
  # Outputs:      $OUT/merge.final and $OUT/bibs_to_add.xml, plus
  #               intermediate files under $INTER
  # Returns:      1 if the working directories cannot be created,
  #               otherwise the status of the final `date` call.
  function calculate_duplicates {
      local fp_type
      local -a merge_maps

      mkdir -p -- "$INTER" "$OUT" || return 1

      # Incumbent bibs are the non-deleted records whose ID is below
      # BIBIDSTART.  Each is emitted as "id<TAB>marc" with the newlines
      # stripped from the MARC so that every record sits on one line.
      printf '%s\n' "select id || chr(9) || REGEXP_REPLACE(marc, E'\\n','','g') from biblio.record_entry where not deleted and id < $BIBIDSTART" > "$INTER/incumbent_bibs.sql"

      echo Extracting incumbent bibs:
      # The query is read back from the file written just above, and the
      # configured port is passed along with the host.
      PGPASSWORD=$DBPASS psql -h "$DBHOST" -p "${DBPORT:-5432}" -A -t \
              -U "$DBUSER" "$DBNAME" < "$INTER/incumbent_bibs.sql" \
          | munge_marc_export_for_fingerprint.pl > "$INTER/incumbent.mrc"

      date
      echo fingerprinter on incumbent bibs:
      fingerprinter --fingerprints oclc,isbn,edition,issn,lccn,accomp,authpub \
          -o "$INTER/incumbent.fp" -x "$INTER/incumbent.fp.ex" "$INTER/incumbent.mrc"

      date
      echo fingerprinter on new bibs:
      # The exceptions file goes to $INTER, next to the incumbent one.
      fingerprinter --fingerprints oclc,isbn,edition,issn,lccn,accomp,authpub \
          -o "$INTER/new.fp" -x "$INTER/new.fp.ex" "$INTER/bib.clean.xml"

      date
      echo Merging fingerprints:
      echo ...all
      cat "$INTER/incumbent.fp" "$INTER/new.fp" | sort -r > "$INTER/dedupe.fp"
      match_fingerprints -t "$BIBIDSTART" -o "$INTER/merge" "$INTER/dedupe.fp"
      merge_maps=("$INTER/merge")

      # One extra matching pass per fingerprint type.  Every merge map made
      # here is recorded in merge_maps so the combining step below cannot
      # fall out of sync with this list.
      # NOTE(review): "accomp" fingerprints are generated above but get no
      # pass of their own here; confirm whether that is intentional.
      for fp_type in isbn authpub lccn oclc issn edition
      do
          echo "...$fp_type"
          grep -- "$fp_type" "$INTER/dedupe.fp" > "$INTER/$fp_type.fp"
          match_fingerprints -t "$BIBIDSTART" -o "$INTER/merge-$fp_type" "$INTER/$fp_type.fp"
          merge_maps+=("$INTER/merge-$fp_type")
      done

      echo ...combining all of the above
      cat "${merge_maps[@]}" | sort | uniq > "$INTER/merge-combined"
      cleanup_merge_map.pl "$INTER/merge-combined" > "$OUT/merge.final"

      echo "Dedupe merge map: $OUT/merge.final"

      echo extract_loadset:
      extract_loadset -l 1 -i "$INTER/bib.clean.xml" -o "$OUT/bibs_to_add.xml" "$OUT/merge.final"

      echo Done with fingerprinting.
      date

  }
 148
 # actually run the process; stop if preparation fails, because
 # calculate_duplicates reads the bib.clean.xml that prepare_bibs produces
 prepare_bibs || exit 1
 calculate_duplicates
 152