From 3061a223b78cd95f61cda81d49e3fa4b74d439b6 Mon Sep 17 00:00:00 2001 From: Ben Ostrowsky Date: Fri, 10 Sep 2010 17:48:17 +0000 Subject: [PATCH] Generates a TSV with egid and MARCXML snippets to preserve for post-deduping reinsertion (insert_tags). --- extract_xml_tags.pl | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 42 insertions(+), 0 deletions(-) create mode 100755 extract_xml_tags.pl diff --git a/extract_xml_tags.pl b/extract_xml_tags.pl new file mode 100755 index 0000000..404e3af --- /dev/null +++ b/extract_xml_tags.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long; + +my (@tags, $infile); +GetOptions ("tags=s" => \@tags, + "infile=s" => \$infile); +@tags = split(/,/, join(',', @tags)); + +open(FH, $infile) or die "Can't open $infile for reading: $!"; + +while () { + + my %tag; + my $xml = $_; + + # Find the Evergreen bib ID + $xml =~ m/(.+?)<\/subfield>/; + my $egid = $1; + + # Find each occurrence of each tag specified + foreach (@tags) { + $tag{$_} = [ $xml =~ m/()/g ]; + } + + # Clean up the results before printing + my $output = ''; + foreach my $key (sort keys %tag) { + my $text = join("", @{$tag{$key}}); + $text =~ s/>\s+