From aa39b12d694a8206ff51a516dcbffb410e91f4a6 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Thu, 18 Dec 2008 18:34:31 +0000 Subject: [PATCH] cleaning up main dir --- Migration Data Work HOWTO.txt | 129 ---------------------- clear-db.sql | 140 ------------------------ database/clear-db.sql | 140 ++++++++++++++++++++++++ database/final.load-bib.sql | 80 ++++++++++++++ final.load-bib.sql | 80 -------------- notes.txt | 5 - split_unicorn_marc_holdings.pl | 67 ------------ transform_unicorn_flat_bills.pl | 51 --------- transform_unicorn_flat_charges.pl | 51 --------- unicorn_patron_xml2text.pl | 213 ------------------------------------- 10 files changed, 220 insertions(+), 736 deletions(-) delete mode 100644 Migration Data Work HOWTO.txt delete mode 100644 clear-db.sql create mode 100644 database/clear-db.sql create mode 100644 database/final.load-bib.sql delete mode 100644 final.load-bib.sql delete mode 100644 notes.txt delete mode 100755 split_unicorn_marc_holdings.pl delete mode 100755 transform_unicorn_flat_bills.pl delete mode 100755 transform_unicorn_flat_charges.pl delete mode 100755 unicorn_patron_xml2text.pl diff --git a/Migration Data Work HOWTO.txt b/Migration Data Work HOWTO.txt deleted file mode 100644 index 77147bb..0000000 --- a/Migration Data Work HOWTO.txt +++ /dev/null @@ -1,129 +0,0 @@ - Migration Data Work HOWTO / Toolkit -======================================================================== -The following is for migrating into an existing system like PINES: - -Get the incoming bib data, and translate to UTF-8 MARCXML. It may -contain holdings. It may contain XML or MARC errors that you have to -sanitize before your tools will work. This is one way to translate -MARC-8 MARC21 to UTF-8 MARCXML: - - yaz-marcdump -f MARC-8 -t UTF-8 -o marcxml \ - incoming.marc > incoming.marc.xml - -If you need to trim the bibs to a subset based on the presence of a -certain value in a specific tag/subfield (for example, if you have the -bibs for all libraries in a foreign system and only need bibs -belonging to a specific migrating library, you might filter based on -their holding tags) - - trim_marc_based_on_tag_subfield_value.pl 999 m BRANCH_CODE \ - incoming.marc.xml > incoming.filtered.marc.xml - -Embed potential native record ids into the incumbent records - - renumber_marc -rf 100000 -t 903 -s a -o incoming.renumbered.marc.xml \ - incoming.marc.xml - -Get primary fingerprints for incoming data and get a bib dump of -matching records from the incumbent system - - fingerprinter -r primary -t 903 -s a -o incoming.primary.fp \ - -x incoming.primary.ex incoming.renumbered.mrc.xml - -#Edit the query_for_primary_matching_incumbent_record.pl script to -#point to the correct Evergreen database and table holding the -#incumbent primary fingerprints (FIXME add in how to create such a -#table). -# -# query_for_primary_matching_incumbent_record.pl incoming.primary.fp \ -# | sort | uniq > primary_matching_incumbent.record_ids -# -#In a postgres shell, you create a temporary table to hold these id's: -# -# CREATE TABLE primary_matching_incumbent_records_for_incoming_library -# (id BIGINT); -# COPY primary_matching_incumbent_records_for_incoming_library -# FROM 'primary_matching_incumbent.record_ids'; -# -#To dump the matching incumbent records to a file, in a postgres shell -#do: -# -# matching_incumbent_records.dump SELECT b.id, b.tcn_source, b.tcn_value, -# regexp_replace(b.marc,E'\n','','g') -# FROM biblio.record_entry AS b -# JOIN primary_matching_incumbent_records_for_incoming_library -# AS c using ( id ); -# -#Now to turn that dump into a MARCXML file with record numbers and TCN -#embedded in tag 901, do: -# -# marc_add_ids -f id -f tcn_source -f tcn_value -f marc \ -# < matching_incumbent_records.dump > matching_incumbent_records.marc.xml -# -#It's possible that this file may need to be itself sanitized some. -#This will transform code=""" into code="&x0022;", for example: -# -# cat matching_incumbent_records.marc.xml | \ -# sed 's/code=\"\"\"/code=\"\"\"/' \ -# > matching_incumbent_records.escaped.mrc.xml - -Get full fingerprints for both datasets and match them. - - fingerprinter -r full -t 901 -s c -o incumbent.fp -x incumbent.ex \ - matching_incumbent_records.marc.xml - fingerprinter -r full -t 903 -s a -o incoming.fp -x incoming.ex \ - incoming.renumbered.marc.xml - -The script below will produce matched groupings, and can optionally -take a 4th and 5th parameter providing scoring information for -determining lead records. In the past, this would consider certain -metrics for MARC quality, but in the latest incarnation, it assumes an -incumbent record will be the lead record, and looks at # of holdings -and possible matching of tag 245 subfield b for determining which of -the incumbent records would be the lead record. The example -invocation below does not use scoring. - - match_fingerprints.pl "name of dataset for dedup interface" \ - incumbent.fp incoming.fp - -This will produce two files, match.groupings and match.record_ids.  -The format for match.groupings is suitable for insertion into the db -for the dedup interface. - -Import these matches and records into the legacy dedup interface for viewing: - -Now to tar up the specific MARC records involved for the dedup interface: - - cat match.groupings | cut -d^ -f3 > incumbent.record_ids - cat match.groupings | cut -d^ -f5 | cut -d, -f2- | sed 's/,/\n/g' \ - > incoming.record_ids - -# mkdir dataset ; cd dataset -# select_marc.pl ../incumbent.record_ids 901 c \ -# ../matching_incumbent_records.mrc.xml -# select_marc.pl ../incoming.record_ids 903 a \ -# ../incoming.renumbered.mrc.xml -# cd .. -# tar cvf dataset.tar dataset -# -#In a mysql shell for the database used with the dedup interface: -# -# LOAD DATA LOCAL INFILE 'match.groupings' INTO TABLE record_group -# FIELDS TERMINATED BY '^' -# ( status, dataset, best_record,records,original_records ); -# -#Create a pretty printed text dump of the non-matching incoming records: -# -# dump_inverse_select_marc.pl incoming.record_ids 903 a \ -# incoming.renumbered.mrc.xml > non_matching_incoming.mrc.txt 2> \ -# non_matching_incoming.mrc.txt.err - -marc2bre.pl --idfield=903 --dontuse=live_tcns.txt -f -quitman_non_matching_incoming.mrc.xml -f -catoosa_non_matching_incoming.mrc.xml --marctype=XML > some.bre - -direct_ingest.pl < some.bre > some.ingest - -perl pg_loader.pl -or bre -or mrd -or mfr -or mtfe -or mafe -or msfe --or mkfe -or msefe -a mrd -a mfr -a mtfe -a mafe -a msfe -a mkfe -a -msefe < ~/gutenberg.ingest > ~/gutenberg.sql diff --git a/clear-db.sql b/clear-db.sql deleted file mode 100644 index 1602265..0000000 --- a/clear-db.sql +++ /dev/null @@ -1,140 +0,0 @@ -SET CLIENT_ENCODING TO 'UNICODE'; - -BEGIN; - -SELECT reporter.disable_materialized_simple_record_trigger(); - -CREATE TEMP TABLE dummy_bib AS SELECT * FROM biblio.record_entry WHERE id = -1; -CREATE TEMP TABLE dummy_cn AS SELECT * FROM asset.call_number WHERE id = -1; - -CREATE TEMP TABLE backup_loc AS SELECT * FROM asset.copy_location WHERE id = 1; -CREATE TEMP TABLE backup_usr AS SELECT * FROM actor.usr WHERE id = 1; -CREATE TEMP TABLE backup_card AS SELECT * FROM actor.card WHERE usr IN (SELECT id FROM backup_usr); - -UPDATE backup_usr SET card = NULL; - -DROP INDEX metabib.metabib_full_rec_index_vector_idx; -DROP INDEX metabib.metabib_full_value_idx; -DROP INDEX metabib.metabib_title_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_author_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_subject_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_keyword_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_series_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_full_rec_tag_subfield_idx; - - -TRUNCATE biblio.record_entry CASCADE; -TRUNCATE metabib.full_rec; -TRUNCATE metabib.rec_descriptor; -TRUNCATE metabib.title_field_entry; -TRUNCATE metabib.author_field_entry; -TRUNCATE metabib.subject_field_entry; -TRUNCATE metabib.keyword_field_entry; -TRUNCATE metabib.series_field_entry; -TRUNCATE auditor.biblio_record_entry_history; -TRUNCATE asset.copy_location CASCADE; - -TRUNCATE actor.usr CASCADE; -TRUNCATE actor.card CASCADE; -TRUNCATE actor.usr_address CASCADE; -TRUNCATE actor.stat_cat CASCADE; -TRUNCATE actor.stat_cat_entry_usr_map CASCADE; -TRUNCATE money.grocery CASCADE; -TRUNCATE money.billing CASCADE; -TRUNCATE action.circulation CASCADE; -TRUNCATE action.hold_request CASCADE; - -SELECT SETVAL('biblio.record_entry_id_seq', 1); - -SELECT SETVAL('money.billable_xact_id_seq', 1); -SELECT SETVAL('money.billing_id_seq', 1); - -SELECT SETVAL('action.hold_request_id_seq', 1); - -SELECT SETVAL('asset.call_number_id_seq', 2); -SELECT SETVAL('asset.copy_id_seq', 1); -SELECT SETVAL('asset.copy_location_id_seq', 2); -SELECT SETVAL('asset.stat_cat_id_seq', 1); -SELECT SETVAL('asset.stat_cat_entry_id_seq', 1); -SELECT SETVAL('asset.stat_cat_entry_copy_map_id_seq', 1); - -SELECT SETVAL('actor.usr_id_seq', 2); -SELECT SETVAL('actor.card_id_seq', 2); -SELECT SETVAL('actor.usr_address_id_seq', 2); -SELECT SETVAL('actor.stat_cat_id_seq', 1); -SELECT SETVAL('actor.stat_cat_entry_id_seq', 1); -SELECT SETVAL('actor.stat_cat_entry_usr_map_id_seq', 1); - -SELECT SETVAL('metabib.full_rec_id_seq', 1); -SELECT SETVAL('metabib.rec_descriptor_id_seq', 1); -SELECT SETVAL('metabib.title_field_entry_id_seq', 1); -SELECT SETVAL('metabib.author_field_entry_id_seq', 1); -SELECT SETVAL('metabib.subject_field_entry_id_seq', 1); -SELECT SETVAL('metabib.keyword_field_entry_id_seq', 1); -SELECT SETVAL('metabib.series_field_entry_id_seq', 1); -SELECT SETVAL('metabib.metarecord_id_seq', 1); -SELECT SETVAL('metabib.metarecord_source_map_id_seq', 1); - -INSERT INTO biblio.record_entry SELECT * FROM dummy_bib; -INSERT INTO asset.call_number SELECT * FROM dummy_cn; -INSERT INTO asset.copy_location SELECT * FROM backup_loc; -INSERT INTO actor.usr SELECT * FROM backup_usr; -INSERT INTO actor.card SELECT * FROM backup_card; -UPDATE actor.usr SET card = actor.card.id FROM actor.card WHERE actor.usr.id = actor.card.usr; -SELECT SETVAL('actor.usr_id_seq', (SELECT MAX(id)+1 FROM actor.usr)); -SELECT SETVAL('actor.card_id_seq', (SELECT MAX(id)+1 FROM actor.card)); - --- Put any scripts that reload bibs/items/etc here. Example included. -/* - -\i incumbent.sql -\i incoming.sql - -*/ -\i IN.sql - -CREATE INDEX metabib_title_field_entry_index_vector_idx ON metabib.title_field_entry USING GIST (index_vector); -CREATE INDEX metabib_author_field_entry_index_vector_idx ON metabib.author_field_entry USING GIST (index_vector); -CREATE INDEX metabib_subject_field_entry_index_vector_idx ON metabib.subject_field_entry USING GIST (index_vector); -CREATE INDEX metabib_keyword_field_entry_index_vector_idx ON metabib.keyword_field_entry USING GIST (index_vector); -CREATE INDEX metabib_series_field_entry_index_vector_idx ON metabib.series_field_entry USING GIST (index_vector); -CREATE INDEX metabib_full_rec_index_vector_idx ON metabib.full_rec USING GIST (index_vector); -CREATE INDEX metabib_full_rec_tag_subfield_idx ON metabib.full_rec (tag,subfield); -CREATE INDEX metabib_full_value_idx ON metabib.full_rec (value); - -/* Run the AFTER committing ... - -ALTER TABLE metabib.metarecord_source_map DROP CONSTRAINT metabib_metarecord_source_map_metarecord_fkey; - -TRUNCATE metabib.metarecord; -TRUNCATE metabib.metarecord_source_map; - -INSERT INTO metabib.metarecord (fingerprint,master_record) - SELECT fingerprint,id - FROM (SELECT DISTINCT ON (fingerprint) - fingerprint, id, quality - FROM biblio.record_entry - ORDER BY fingerprint, quality desc) AS x - WHERE fingerprint IS NOT NULL; - -INSERT INTO metabib.metarecord_source_map (metarecord,source) - SELECT m.id, b.id - FROM biblio.record_entry b - JOIN metabib.metarecord m ON (m.fingerprint = b.fingerprint); - -ALTER TABLE metabib.metarecord_source_map - ADD CONSTRAINT metabib_metarecord_source_map_metarecord_fkey - FOREIGN KEY (metarecord) REFERENCES metabib.metarecord (id) DEFERRABLE INITIALLY DEFERRED; - -*/ - - -/* And this too, if it's production - -SELECT reporter.enable_materialized_simple_record_trigger(); - -*/ - --- COMMIT; --- VACUUM FULL; - diff --git a/database/clear-db.sql b/database/clear-db.sql new file mode 100644 index 0000000..1602265 --- /dev/null +++ b/database/clear-db.sql @@ -0,0 +1,140 @@ +SET CLIENT_ENCODING TO 'UNICODE'; + +BEGIN; + +SELECT reporter.disable_materialized_simple_record_trigger(); + +CREATE TEMP TABLE dummy_bib AS SELECT * FROM biblio.record_entry WHERE id = -1; +CREATE TEMP TABLE dummy_cn AS SELECT * FROM asset.call_number WHERE id = -1; + +CREATE TEMP TABLE backup_loc AS SELECT * FROM asset.copy_location WHERE id = 1; +CREATE TEMP TABLE backup_usr AS SELECT * FROM actor.usr WHERE id = 1; +CREATE TEMP TABLE backup_card AS SELECT * FROM actor.card WHERE usr IN (SELECT id FROM backup_usr); + +UPDATE backup_usr SET card = NULL; + +DROP INDEX metabib.metabib_full_rec_index_vector_idx; +DROP INDEX metabib.metabib_full_value_idx; +DROP INDEX metabib.metabib_title_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_author_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_subject_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_keyword_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_series_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_full_rec_tag_subfield_idx; + + +TRUNCATE biblio.record_entry CASCADE; +TRUNCATE metabib.full_rec; +TRUNCATE metabib.rec_descriptor; +TRUNCATE metabib.title_field_entry; +TRUNCATE metabib.author_field_entry; +TRUNCATE metabib.subject_field_entry; +TRUNCATE metabib.keyword_field_entry; +TRUNCATE metabib.series_field_entry; +TRUNCATE auditor.biblio_record_entry_history; +TRUNCATE asset.copy_location CASCADE; + +TRUNCATE actor.usr CASCADE; +TRUNCATE actor.card CASCADE; +TRUNCATE actor.usr_address CASCADE; +TRUNCATE actor.stat_cat CASCADE; +TRUNCATE actor.stat_cat_entry_usr_map CASCADE; +TRUNCATE money.grocery CASCADE; +TRUNCATE money.billing CASCADE; +TRUNCATE action.circulation CASCADE; +TRUNCATE action.hold_request CASCADE; + +SELECT SETVAL('biblio.record_entry_id_seq', 1); + +SELECT SETVAL('money.billable_xact_id_seq', 1); +SELECT SETVAL('money.billing_id_seq', 1); + +SELECT SETVAL('action.hold_request_id_seq', 1); + +SELECT SETVAL('asset.call_number_id_seq', 2); +SELECT SETVAL('asset.copy_id_seq', 1); +SELECT SETVAL('asset.copy_location_id_seq', 2); +SELECT SETVAL('asset.stat_cat_id_seq', 1); +SELECT SETVAL('asset.stat_cat_entry_id_seq', 1); +SELECT SETVAL('asset.stat_cat_entry_copy_map_id_seq', 1); + +SELECT SETVAL('actor.usr_id_seq', 2); +SELECT SETVAL('actor.card_id_seq', 2); +SELECT SETVAL('actor.usr_address_id_seq', 2); +SELECT SETVAL('actor.stat_cat_id_seq', 1); +SELECT SETVAL('actor.stat_cat_entry_id_seq', 1); +SELECT SETVAL('actor.stat_cat_entry_usr_map_id_seq', 1); + +SELECT SETVAL('metabib.full_rec_id_seq', 1); +SELECT SETVAL('metabib.rec_descriptor_id_seq', 1); +SELECT SETVAL('metabib.title_field_entry_id_seq', 1); +SELECT SETVAL('metabib.author_field_entry_id_seq', 1); +SELECT SETVAL('metabib.subject_field_entry_id_seq', 1); +SELECT SETVAL('metabib.keyword_field_entry_id_seq', 1); +SELECT SETVAL('metabib.series_field_entry_id_seq', 1); +SELECT SETVAL('metabib.metarecord_id_seq', 1); +SELECT SETVAL('metabib.metarecord_source_map_id_seq', 1); + +INSERT INTO biblio.record_entry SELECT * FROM dummy_bib; +INSERT INTO asset.call_number SELECT * FROM dummy_cn; +INSERT INTO asset.copy_location SELECT * FROM backup_loc; +INSERT INTO actor.usr SELECT * FROM backup_usr; +INSERT INTO actor.card SELECT * FROM backup_card; +UPDATE actor.usr SET card = actor.card.id FROM actor.card WHERE actor.usr.id = actor.card.usr; +SELECT SETVAL('actor.usr_id_seq', (SELECT MAX(id)+1 FROM actor.usr)); +SELECT SETVAL('actor.card_id_seq', (SELECT MAX(id)+1 FROM actor.card)); + +-- Put any scripts that reload bibs/items/etc here. Example included. +/* + +\i incumbent.sql +\i incoming.sql + +*/ +\i IN.sql + +CREATE INDEX metabib_title_field_entry_index_vector_idx ON metabib.title_field_entry USING GIST (index_vector); +CREATE INDEX metabib_author_field_entry_index_vector_idx ON metabib.author_field_entry USING GIST (index_vector); +CREATE INDEX metabib_subject_field_entry_index_vector_idx ON metabib.subject_field_entry USING GIST (index_vector); +CREATE INDEX metabib_keyword_field_entry_index_vector_idx ON metabib.keyword_field_entry USING GIST (index_vector); +CREATE INDEX metabib_series_field_entry_index_vector_idx ON metabib.series_field_entry USING GIST (index_vector); +CREATE INDEX metabib_full_rec_index_vector_idx ON metabib.full_rec USING GIST (index_vector); +CREATE INDEX metabib_full_rec_tag_subfield_idx ON metabib.full_rec (tag,subfield); +CREATE INDEX metabib_full_value_idx ON metabib.full_rec (value); + +/* Run the AFTER committing ... + +ALTER TABLE metabib.metarecord_source_map DROP CONSTRAINT metabib_metarecord_source_map_metarecord_fkey; + +TRUNCATE metabib.metarecord; +TRUNCATE metabib.metarecord_source_map; + +INSERT INTO metabib.metarecord (fingerprint,master_record) + SELECT fingerprint,id + FROM (SELECT DISTINCT ON (fingerprint) + fingerprint, id, quality + FROM biblio.record_entry + ORDER BY fingerprint, quality desc) AS x + WHERE fingerprint IS NOT NULL; + +INSERT INTO metabib.metarecord_source_map (metarecord,source) + SELECT m.id, b.id + FROM biblio.record_entry b + JOIN metabib.metarecord m ON (m.fingerprint = b.fingerprint); + +ALTER TABLE metabib.metarecord_source_map + ADD CONSTRAINT metabib_metarecord_source_map_metarecord_fkey + FOREIGN KEY (metarecord) REFERENCES metabib.metarecord (id) DEFERRABLE INITIALLY DEFERRED; + +*/ + + +/* And this too, if it's production + +SELECT reporter.enable_materialized_simple_record_trigger(); + +*/ + +-- COMMIT; +-- VACUUM FULL; + diff --git a/database/final.load-bib.sql b/database/final.load-bib.sql new file mode 100644 index 0000000..b2108e4 --- /dev/null +++ b/database/final.load-bib.sql @@ -0,0 +1,80 @@ +SET CLIENT_ENCODING TO 'UNICODE'; + +BEGIN; + +SELECT reporter.disable_materialized_simple_record_trigger(); + +CREATE TEMP TABLE dummy_bib AS SELECT * FROM biblio.record_entry WHERE id = -1; +CREATE TEMP TABLE dummy_cn AS SELECT * FROM asset.call_number WHERE id = -1; + +DROP INDEX metabib.metabib_full_rec_index_vector_idx; +DROP INDEX metabib.metabib_title_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_author_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_subject_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_keyword_field_entry_index_vector_idx; +DROP INDEX metabib.metabib_series_field_entry_index_vector_idx; + +TRUNCATE biblio.record_entry CASCADE; +TRUNCATE metabib.full_rec; +TRUNCATE metabib.rec_descriptor; +TRUNCATE metabib.title_field_entry; +TRUNCATE metabib.author_field_entry; +TRUNCATE metabib.subject_field_entry; +TRUNCATE metabib.keyword_field_entry; +TRUNCATE metabib.series_field_entry; +TRUNCATE auditor.biblio_record_entry_history; + +SELECT SETVAL('biblio.record_entry_id_seq', 1); +SELECT SETVAL('metabib.full_rec_id_seq', 1); +SELECT SETVAL('metabib.rec_descriptor_id_seq', 1); +SELECT SETVAL('metabib.title_field_entry_id_seq', 1); +SELECT SETVAL('metabib.author_field_entry_id_seq', 1); +SELECT SETVAL('metabib.subject_field_entry_id_seq', 1); +SELECT SETVAL('metabib.keyword_field_entry_id_seq', 1); +SELECT SETVAL('metabib.series_field_entry_id_seq', 1); +SELECT SETVAL('metabib.metarecord_id_seq', 1); +SELECT SETVAL('metabib.metarecord_source_map_id_seq', 1); + +INSERT INTO biblio.record_entry SELECT * FROM dummy_bib; +INSERT INTO asset.call_number SELECT * FROM dummy_cn; + +\i incumbent.sql +\i incoming.sql + +CREATE INDEX metabib_title_field_entry_index_vector_idx ON metabib.title_field_entry USING GIST (index_vector); +CREATE INDEX metabib_author_field_entry_index_vector_idx ON metabib.author_field_entry USING GIST (index_vector); +CREATE INDEX metabib_subject_field_entry_index_vector_idx ON metabib.subject_field_entry USING GIST (index_vector); +CREATE INDEX metabib_keyword_field_entry_index_vector_idx ON metabib.keyword_field_entry USING GIST (index_vector); +CREATE INDEX metabib_series_field_entry_index_vector_idx ON metabib.series_field_entry USING GIST (index_vector); +CREATE INDEX metabib_full_rec_index_vector_idx ON metabib.full_rec USING GIST (index_vector); +CREATE INDEX metabib_full_rec_tag_subfield_idx ON metabib.full_rec (tag,subfield); +CREATE INDEX metabib_full_value_idx ON metabib.full_rec (value); + +ALTER TABLE metabib.metarecord_source_map DROP CONSTRAINT metabib_metarecord_source_map_metarecord_fkey; + +TRUNCATE metabib.metarecord; +TRUNCATE metabib.metarecord_source_map; + +INSERT INTO metabib.metarecord (fingerprint,master_record) + SELECT fingerprint,id + FROM (SELECT DISTINCT ON (fingerprint) + fingerprint, id, quality + FROM biblio.record_entry + ORDER BY fingerprint, quality desc) AS x + WHERE fingerprint IS NOT NULL; + +INSERT INTO metabib.metarecord_source_map (metarecord,source) + SELECT m.id, b.id + FROM biblio.record_entry b + JOIN metabib.metarecord m ON (m.fingerprint = b.fingerprint); + +ALTER TABLE metabib.metarecord_source_map + ADD CONSTRAINT metabib_metarecord_source_map_metarecord_fkey + FOREIGN KEY (metarecord) REFERENCES metabib.metarecord (id) DEFERRABLE INITIALLY DEFERRED; + +SELECT reporter.enable_materialized_simple_record_trigger(); + + +-- COMMIT; +-- VACUUM FULL; + diff --git a/final.load-bib.sql b/final.load-bib.sql deleted file mode 100644 index b2108e4..0000000 --- a/final.load-bib.sql +++ /dev/null @@ -1,80 +0,0 @@ -SET CLIENT_ENCODING TO 'UNICODE'; - -BEGIN; - -SELECT reporter.disable_materialized_simple_record_trigger(); - -CREATE TEMP TABLE dummy_bib AS SELECT * FROM biblio.record_entry WHERE id = -1; -CREATE TEMP TABLE dummy_cn AS SELECT * FROM asset.call_number WHERE id = -1; - -DROP INDEX metabib.metabib_full_rec_index_vector_idx; -DROP INDEX metabib.metabib_title_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_author_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_subject_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_keyword_field_entry_index_vector_idx; -DROP INDEX metabib.metabib_series_field_entry_index_vector_idx; - -TRUNCATE biblio.record_entry CASCADE; -TRUNCATE metabib.full_rec; -TRUNCATE metabib.rec_descriptor; -TRUNCATE metabib.title_field_entry; -TRUNCATE metabib.author_field_entry; -TRUNCATE metabib.subject_field_entry; -TRUNCATE metabib.keyword_field_entry; -TRUNCATE metabib.series_field_entry; -TRUNCATE auditor.biblio_record_entry_history; - -SELECT SETVAL('biblio.record_entry_id_seq', 1); -SELECT SETVAL('metabib.full_rec_id_seq', 1); -SELECT SETVAL('metabib.rec_descriptor_id_seq', 1); -SELECT SETVAL('metabib.title_field_entry_id_seq', 1); -SELECT SETVAL('metabib.author_field_entry_id_seq', 1); -SELECT SETVAL('metabib.subject_field_entry_id_seq', 1); -SELECT SETVAL('metabib.keyword_field_entry_id_seq', 1); -SELECT SETVAL('metabib.series_field_entry_id_seq', 1); -SELECT SETVAL('metabib.metarecord_id_seq', 1); -SELECT SETVAL('metabib.metarecord_source_map_id_seq', 1); - -INSERT INTO biblio.record_entry SELECT * FROM dummy_bib; -INSERT INTO asset.call_number SELECT * FROM dummy_cn; - -\i incumbent.sql -\i incoming.sql - -CREATE INDEX metabib_title_field_entry_index_vector_idx ON metabib.title_field_entry USING GIST (index_vector); -CREATE INDEX metabib_author_field_entry_index_vector_idx ON metabib.author_field_entry USING GIST (index_vector); -CREATE INDEX metabib_subject_field_entry_index_vector_idx ON metabib.subject_field_entry USING GIST (index_vector); -CREATE INDEX metabib_keyword_field_entry_index_vector_idx ON metabib.keyword_field_entry USING GIST (index_vector); -CREATE INDEX metabib_series_field_entry_index_vector_idx ON metabib.series_field_entry USING GIST (index_vector); -CREATE INDEX metabib_full_rec_index_vector_idx ON metabib.full_rec USING GIST (index_vector); -CREATE INDEX metabib_full_rec_tag_subfield_idx ON metabib.full_rec (tag,subfield); -CREATE INDEX metabib_full_value_idx ON metabib.full_rec (value); - -ALTER TABLE metabib.metarecord_source_map DROP CONSTRAINT metabib_metarecord_source_map_metarecord_fkey; - -TRUNCATE metabib.metarecord; -TRUNCATE metabib.metarecord_source_map; - -INSERT INTO metabib.metarecord (fingerprint,master_record) - SELECT fingerprint,id - FROM (SELECT DISTINCT ON (fingerprint) - fingerprint, id, quality - FROM biblio.record_entry - ORDER BY fingerprint, quality desc) AS x - WHERE fingerprint IS NOT NULL; - -INSERT INTO metabib.metarecord_source_map (metarecord,source) - SELECT m.id, b.id - FROM biblio.record_entry b - JOIN metabib.metarecord m ON (m.fingerprint = b.fingerprint); - -ALTER TABLE metabib.metarecord_source_map - ADD CONSTRAINT metabib_metarecord_source_map_metarecord_fkey - FOREIGN KEY (metarecord) REFERENCES metabib.metarecord (id) DEFERRABLE INITIALLY DEFERRED; - -SELECT reporter.enable_materialized_simple_record_trigger(); - - --- COMMIT; --- VACUUM FULL; - diff --git a/notes.txt b/notes.txt deleted file mode 100644 index 0bd0a2c..0000000 --- a/notes.txt +++ /dev/null @@ -1,5 +0,0 @@ -# One-linerize an XML (so that it can be split with 'split -l', for example) -perl -e 'my $line="";while(<>){chomp;$line.=$_;if($_ eq ""){$line=~s/>\s+ pines-marca$i.xml; done diff --git a/split_unicorn_marc_holdings.pl b/split_unicorn_marc_holdings.pl deleted file mode 100755 index c99d1a1..0000000 --- a/split_unicorn_marc_holdings.pl +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/perl -use open ':utf8'; -use MARC::Batch; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -use MARC::Field; -use Unicode::Normalize; - -my $count = 0; - -binmode(STDOUT, ':utf8'); -binmode(STDIN, ':utf8'); - -print join("\t", - "bib id", - "library", - "barcode", - "current location", - "home location", - "call number", - "item type", - "acq date", - "price", - "circulate flag", - "total charges", - "cat1", - "cat2" -) . "\n"; - -foreach my $argnum ( 0 .. $#ARGV ) { - - print STDERR "Processing " . $ARGV[$argnum] . "\n"; - - my $M; - open $M, '<:utf8', $ARGV[$argnum]; - my $batch = MARC::Batch->new('XML',$M); - - $batch->strict_off(); - $batch->warnings_off(); - - while ( my $record = $batch->next() ) { - - $count++; - - print STDERR "WARNINGS: Record $count : " . join(":",@warnings) . " : continuing...\n" if ( @warnings ); - my $my_903a = $record->field('903')->subfield('a'); # target bib id's here - my @tags = $record->field('999'); - foreach my $tag ( @tags ) { - print join("\t", - $my_903a, - $tag->subfield('m') || '', # library - $tag->subfield('i') || '', # barcode - $tag->subfield('k') || '', # current location - $tag->subfield('l') || '', # home location - $tag->subfield('a') || '', # call number - $tag->subfield('t') || '', # item type - $tag->subfield('u') || '', # acq date - $tag->subfield('p') || '', # price - $tag->subfield('r') || '', # circulate flag - $tag->subfield('n') || '', # total charges - $tag->subfield('x') || '', # cat1 - $tag->subfield('z') || '' # cat2 - ) . "\n"; - } - - } - print STDERR "Processed $count records\n"; -} diff --git a/transform_unicorn_flat_bills.pl b/transform_unicorn_flat_bills.pl deleted file mode 100755 index a754bff..0000000 --- a/transform_unicorn_flat_bills.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl -w - -my $count = 0; -my %records = (); - -sub print_line { - print join("\t", - $records{ $count }{'FORM'} || '', - $records{ $count }{'USER_ID'} || '', - $records{ $count }{'ITEM_ID'} || '', - $records{ $count }{'BILL_LIBRARY'} || '', - $records{ $count }{'BILL_DB'} || '', - $records{ $count }{'BILL_AMOUNT'} || '', - $records{ $count }{'BILL_REASON'} || '', - ) . "\n"; -} - -print "FORM\tUSER_ID\tITEM_ID\tBILL_LIBRARY\tBILL_DB\tBILL_AMOUNT\tBILL_REASON\n"; - -while (my $line = <>) { - chomp $line; $line =~ s/[\r\n]//g; - if ($line =~ /DOCUMENT BOUNDARY/) { - if (defined $records{ $count }) { - print_line(); - } - $count++; $records{ $count } = {}; - } - if ($line =~ /FORM=(.+)/) { - $records{ $count }{'FORM'} = $1; - } - if ($line =~ /\.USER_ID\..+\|a(.+)/) { - $records{ $count }{'USER_ID'} = $1; - } - if ($line =~ /\.ITEM_ID\..+\|a(.+)/) { - $records{ $count }{'ITEM_ID'} = $1; - } - if ($line =~ /\.BILL_LIBRARY\..+\|a(.+)/) { - $records{ $count }{'BILL_LIBRARY'} = $1; - } - if ($line =~ /\.BILL_DB\..+\|a(.+)/) { - $records{ $count }{'BILL_DB'} = $1; - } - if ($line =~ /\.BILL_AMOUNT\..+\|a(.+)/) { - $records{ $count }{'BILL_AMOUNT'} = $1; - } - if ($line =~ /\.BILL_REASON\..+\|a(.+)/) { - $records{ $count }{'BILL_REASON'} = $1; - } -} -print_line(); - diff --git a/transform_unicorn_flat_charges.pl b/transform_unicorn_flat_charges.pl deleted file mode 100755 index 9f53e87..0000000 --- a/transform_unicorn_flat_charges.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl -w - -my $count = 0; -my %records = (); - -sub print_line { - print join("\t", - $records{ $count }{'FORM'} || '', - $records{ $count }{'USER_ID'} || '', - $records{ $count }{'ITEM_ID'} || '', - $records{ $count }{'CHRG_LIBRARY'} || '', - $records{ $count }{'CHRG_DC'} || '', - $records{ $count }{'CHRG_DATEDUE'} || '', - $records{ $count }{'CHRG_DATE_CLMRET'} || '', - ) . "\n"; -} - -print "FORM\tUSER_ID\tITEM_ID\tCHRG_LIBRARY\tCHRG_DC\tCHRG_DATEDUE\tCHRG_DATE_CLMRET\n"; - -while (my $line = <>) { - chomp $line; $line =~ s/[\r\n]//g; - if ($line =~ /DOCUMENT BOUNDARY/) { - if (defined $records{ $count }) { - print_line(); - } - $count++; $records{ $count } = {}; - } - if ($line =~ /FORM=(.+)/) { - $records{ $count }{'FORM'} = $1; - } - if ($line =~ /\.USER_ID\..+\|a(.+)/) { - $records{ $count }{'USER_ID'} = $1; - } - if ($line =~ /\.ITEM_ID\..+\|a(.+)/) { - $records{ $count }{'ITEM_ID'} = $1; - } - if ($line =~ /\.CHRG_LIBRARY\..+\|a(.+)/) { - $records{ $count }{'CHRG_LIBRARY'} = $1; - } - if ($line =~ /\.CHRG_DC\..+\|a(.+)/) { - $records{ $count }{'CHRG_DC'} = $1; - } - if ($line =~ /\.CHRG_DATEDUE\..+\|a(.+)/) { - $records{ $count }{'CHRG_DATEDUE'} = $1; - } - if ($line =~ /\.CHRG_DATE_CLMRET\..+\|a(.+)/) { - $records{ $count }{'CHRG_DATE_CLMRET'} = $1; - } -} -print_line(); - diff --git a/unicorn_patron_xml2text.pl b/unicorn_patron_xml2text.pl deleted file mode 100755 index 97149c8..0000000 --- a/unicorn_patron_xml2text.pl +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use DateTime; -use Time::HiRes qw/time/; -use XML::LibXML; - -my %s_map; - -my $doc = XML::LibXML->new->parse_file($ARGV[0]); - -my $starttime = time; -my $count = 1; - -my @base_elements = ( - "user_id", - "user_altid", - "user_pin", - "user_profile", - "user_status", - "user_library", - "user_priv_granted", - "user_priv_expires", - "user_mailingaddr", - "birthdate", - "last_name", - "first_name", - "middle_name", - "suffix_name", - "note", - "note1", - "comment", - "staff", - "webcatpref", - "user_category1", - "user_category2", - "user_category3", - "user_category4", - "dept", - "guardian", - "license", - "aup", - "photo", - "notify_via", - "user_claims_ret", - "user_environment", - "user_department", - "ums_id" -); - -my @addr_elements = ( - "std_line1", - "std_line2", - "std_city", - "std_state", - "std_zip", - "phone", - "dayphone", - "homephone", - "workphone", - "cellphone", - "email", - "location", - "usefor", - "care_of", - "known_bad" -); - -print STDOUT join("\t", @base_elements); -foreach my $addr ( 1..3 ) { - print STDOUT "\t" . join("\t", @addr_elements); -} -print STDOUT "\tuserid_active\tinactive_barcode1\tinactive_barcode2"; -print STDOUT "\n"; - -for my $patron ( $doc->documentElement->childNodes ) { - next if ($patron->nodeType == 3); - - my $bc = $patron->findvalue( 'user_id' ); $bc =~ s/^\s+//; $bc =~ s/\s+$//; - if (exists($s_map{$bc})) { - $count++; - warn "\n!!! already saw barcode $bc, skipping\n"; - next; - } else { - $s_map{$bc} = 1; - } - - unless (defined($bc)) { - my $xml = $patron->toString; - warn "\n!!! no barcode found in UMS data, user number $count, xml => $xml \n"; - $count++; - next; - } - - foreach my $e ( @base_elements ) { - my $v = $patron->findvalue( $e ); $v =~ s/^\s+//; $v =~ s/\s+$//; - if ( $v && ( $e eq 'birthdate' || $e eq 'user_priv_granted' || $e eq 'user_priv_expires' ) ) { $v = parse_date($v); } - print STDOUT ( $v ? $v : '' ) . "\t"; - } - - my %addresses; - - for my $addr ( $patron->findnodes( "Address" ) ) { - my $addr_type = $addr->getAttribute('addr_type'); - $addresses{$addr_type} = $addr; - } - - foreach my $t ( 1..3 ) { - if ($addresses{$t}) { - foreach my $e ( @addr_elements ) { - my $v; - if ($e eq "known_bad") { - $v = $addresses{$t}->getAttribute( $e ); if ($v) { $v =~ s/^\s+//; $v =~ s/\s+$//; } - } else { - $v = $addresses{$t}->findvalue( $e ); $v =~ s/^\s+//; $v =~ s/\s+$//; - } - print STDOUT ( $v ? $v : '' ) . "\t"; - } - } else { - foreach ( @addr_elements ) { print STDOUT "\t"; } - } - } - - my $inactive_barcode1 = ''; - my $inactive_barcode2 = ''; - my $userid_active = 't'; - my @barcodes = $patron->findnodes( "barcodes" ); - for my $i_bc ( $barcodes[0]->findnodes( "barcode" ) ) { - my $active = $i_bc->getAttribute('active'); - if ($active eq "0" && $i_bc->textContent eq $bc) { - $userid_active = 'f'; - } - if ($active eq "0" && $i_bc->textContent ne $bc) { - if (! $inactive_barcode1 ) { - $inactive_barcode1 = $i_bc->textContent; - $inactive_barcode1 =~ s/^\s+//; - $inactive_barcode1 =~ s/\s+$//; - } else { - if (! $inactive_barcode2 ) { - $inactive_barcode2 = $i_bc->textContent; - $inactive_barcode2 =~ s/^\s+//; - $inactive_barcode2 =~ s/\s+$//; - } else { - warn "Extra barcode (" . $i_bc->textContent . ") for user with id = " . $bc . "\n"; - } - } - } - } - print STDOUT "$userid_active\t$inactive_barcode1\t$inactive_barcode2"; - - print STDOUT "\n"; - $count++; -} - -sub parse_date { - my $string = shift; - my $group = shift; - - my ($y,$m,$d); - - if ($string eq 'NEVER') { - my (undef,undef,undef,$d,$m,$y) = localtime(); - return sprintf('%04d-%02d-%02d', $y + 1920, $m + 1, $d); - } elsif (length($string) == 8 && $string =~ /^(\d{4})(\d{2})(\d{2})$/o) { - ($y,$m,$d) = ($1,$2,$3); - } elsif ($string =~ /(\d+)\D(\d+)\D(\d+)/o) { #looks like it's parsable - if ( length($3) > 2 ) { # looks like mm.dd.yyyy - if ( $1 < 99 && $2 < 99 && $1 > 0 && $2 > 0 && $3 > 0) { - if ($1 > 12 && $1 < 31 && $2 < 13) { # well, actually it looks like dd.mm.yyyy - ($y,$m,$d) = ($3,$2,$1); - } elsif ($2 > 12 && $2 < 31 && $1 < 13) { - ($y,$m,$d) = ($3,$1,$2); - } - } - } elsif ( length($1) > 3 ) { # format probably yyyy.mm.dd - if ( $3 < 99 && $2 < 99 && $1 > 0 && $2 > 0 && $3 > 0) { - if ($2 > 12 && $2 < 32 && $3 < 13) { # well, actually it looks like yyyy.dd.mm -- why, I don't konw - ($y,$m,$d) = ($1,$3,$2); - } elsif ($3 > 12 && $3 < 31 && $2 < 13) { - ($y,$m,$d) = ($1,$2,$3); - } - } - } elsif ( $1 < 99 && $2 < 99 && $3 < 99 && $1 > 0 && $2 > 0 && $3 > 0) { - if ($3 < 7) { # probably 2000 or greater, mm.dd.yy - $y = $3 + 2000; - if ($1 > 12 && $1 < 32 && $2 < 13) { # well, actually it looks like dd.mm.yyyy - ($m,$d) = ($2,$1); - } elsif ($2 > 12 && $2 < 32 && $1 < 13) { - ($m,$d) = ($1,$2); - } - } else { # probably before 2000, mm.dd.yy - $y = $3 + 1900; - if ($1 > 12 && $1 < 32 && $2 < 13) { # well, actually it looks like dd.mm.yyyy - ($m,$d) = ($2,$1); - } elsif ($2 > 12 && $2 < 32 && $1 < 13) { - ($m,$d) = ($1,$2); - } - } - } - } - - my $date = $string; - if ($y && $m && $d) { - eval { - $date = sprintf('%04d-%02d-%-2d',$y, $m, $d) - if (new DateTime ( year => $y, month => $m, day => $d )); - } - } - - return $date; -} - -- 1.7.2.5