From ce9ac5aeee49207bc0adf3b021db0f33742feddd Mon Sep 17 00:00:00 2001 From: Lebbeous Fogle-Weekley Date: Fri, 20 Jul 2012 10:34:48 -0400 Subject: [PATCH] Very rough tool to help parse serctl.data from a Symphony export Signed-off-by: Lebbeous Fogle-Weekley --- symphony/serials/parser-serctrl.pl | 319 ++++++++++++++++++++++++++++++++++++ 1 files changed, 319 insertions(+), 0 deletions(-) create mode 100644 symphony/serials/parser-serctrl.pl diff --git a/symphony/serials/parser-serctrl.pl b/symphony/serials/parser-serctrl.pl new file mode 100644 index 0000000..baa5360 --- /dev/null +++ b/symphony/serials/parser-serctrl.pl @@ -0,0 +1,319 @@ +#!/usr/bin/perl + +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +require 5.10.0; + +# This is still very rough. + +use Getopt::Std; +use Data::Dumper; +use MARC::Field; +use JSON::XS; + +use utf8; + +# order of field separators, i think: ! ; : . + +# pubcycle_to_scap() was derived at late hours, and isn't totally right. It's +# also not commented yet as you can see. Sorry, but this still needs lots of +# work to be reliable and understandable. +# +# $record is one of the objects loaded from serctl.data. $field is a +# MARC::Field object. +sub pubcycle_to_scap { + my ($record, $field) = @_; + + if ($record->{NAM_TYPE} ne 'NUMERATION') { + $field->update(i => '(year)'); + $field->update(x => '01'); + + if ($record->{NAM_TYPE} =~ /SEASON/) { + $field->update(j => '(season)'); + } elsif ($record->{NAM_TYPE} =~ /MONTH|DATE/) { + $field->update(j => '(month)'); + + $field->update(k => '(day)') if $record->{NAM_TYPE} eq 'DATE'; + } + } + + my @pfields = split /\!/, $record->{PUBCYCLE_DEF}; + + my @dow = qw/su mo tu we th fr sa/; + my @yf = qw/X a g h/; + my %periods_mo = ( 1 => 'm', 2 => 'b', 3 => 'q', 6 => 'f' ); + + if (@pfields == 2) { + if ($pfields[0] =~ /^(\d):M$/ and $pfields[1] =~ /^\d+:D$/ and $record->{NAM_TYPE} =~ /MONTH|CUSTOM|SEASON/) { + $pfields[0] =~ /^(\d):M$/; + my $freq_m = $1; + $pfields[1] =~ /^(\d+):D$/; + my $on = $1; + return 0 unless exists $periods_mo{$freq_m}; # not handled + $field->update( + w => $periods_mo{$freq_m}, y => sprintf('pd%02d', $on) + ); + return 1; # early out + } + elsif ($pfields[0] =~ /^[12]:W$/ and $pfields[1] =~ /^\d+:D$/) { + $pfields[0] =~ /^([12]):W$/; + my $freq_w = $1; + $pfields[1] =~ /^(\d+):D$/; + my $on = $1; + $field->update( + w => ($freq_m == 1 ? 'w' : 'e'), y => sprintf('pd%s', $dow[$on-1]) + ); + return 1; # early out + } + elsif ($pfields[0] =~ /^[1-3]:Y$/ and $pfields[1] =~ /^\d+:D\.\d+:M$/) { + $pfields[0] =~ /^([1-3]):Y$/; + my $freq_y = $1; + $pfields[1] =~ /^(\d+):D\.(\d+):M$/; + my $on_day = $1; + my $of_mo = $2; + $field->update( + w => $yf[$freq_y], + y => sprintf('pd%02d%02d', $of_mo, $on_day) + ); + return 1; # early out + } + } elsif (@pfields == 3) { + if ($pfields[0] =~ /^\d:M$/ && $pfields[1] =~ /^\d+:D(;|$)/) { + $pfields[0] =~ /^(\d):M$/; + my $months = $1; + + my @dates; + foreach my $date (split /;/, $pfields[1]) { + $date =~ /^(\d+):D/ or return 0; # not handled + push @dates, sprintf("%02d", $1); + } + my $potential = 12 / $months * @dates; + my @combos = split /;/, $pfields[2]; + my @cparts, @oparts; + foreach my $combo (@combos) { + if ($combo =~ /^(\d+):M\.0:Y/) { + my $squash = $1; + push @cparts, sprintf("%02d/%02d", $squash - 1, $squash); + } elsif ($combo =~ /(\d+):D\.(\d+):M\.0:Y/) { + push @oparts, sprintf("%02d%02d", $2, $1); + } else { + return 0; # abort, not handled yet + } + } + my $u; + $u = $potential - @cparts if @cparts; + $u = $potential - @oparts if @oparts; + + my $w; + if ($months == 1) { + $w = $periods_mo{$months}; + $w = 's' if $u > 12; + } elsif (exists $periods_mo{$months}) { + $w = $periods_mo{$months}; + } else { + return 0; # abort, not handled + } + $field->update(u => $u, v => 'r', w => $w); + $field->add_subfields(y => 'cm' . join(",", @cparts)) if @cparts; + $field->add_subfields(y => 'od' . join(",", @oparts)) if @oparts; + $field->add_subfields(y => 'pd' . join(",", @dates)); + return 1; # early out + } + } + + return 0; # fail +} + +sub record_bits_to_scap { + my ($record) = @_; + + # set up constants + my $field = new MARC::Field( + '853', # tag doesn't really matter here + 2 => '0', + 8 => '1', + i => '(year)' + ); + + # when nam_type is true, use sub_iss for numeric $w + $field->update(a => $record->{SERC_LBL1}) if $record->{SERC_LBL1}; + if ($record->{SERC_LBL2}) { + $field->update(b => $record->{SERC_LBL2}); + $field->update(u => $record->{SERC_LMT2} || 'var'); + $field->update(v => 'r'); + } + + # return the representation we need + return (new JSON::XS)->encode([ + $field->indicator(1), + $field->indicator(2), + map { @$_ } $field->subfields + ]) if pubcycle_to_scap($record, $field); + + return "null"; +} + +# actually parses serctl.data, the file with all the DOCUMENT BOUNDARY and +# other stuff +sub load_serctl_export { + my ($filename) = @_; + + open FH, "<$filename" or die ("can't read $filename: $!"); + + my $entries = []; + my $entry; + while () { + chomp; + + # If we don't match this regex, move to next entry. + if (not /^\.(\w+)\.(?:.+\|a(.+))?$/) { + push @$entries, $entry if $entry; + $entry = {}; + next; + } + + # If we don't have a defined $2, just move to next line. + next unless defined $2; + + $entry->{$1} = $2; + } + close FH; + + return $entries; +} + +sub unique_keys { + my ($data) = @_; + + my $small = {}; + foreach my $hash (@$data) { + foreach (keys %$hash) { + $small->{$_} ||= 0; + $small->{$_}++; + } + } + + return $small; +} + +# The title key map is a simple text file made up of lines. Each line contains +# two tokens separated by a space. The first one is the value of +# SERC_TITLE_KEY. The second one is the id of the biblio.record_entry row +# that corresponds to it. I don't have a good script for making that yet. +# SERC_TITLE_KEY can be different things (ISxN, 035‡a, etc) so you have to +# build the map by using various database queries (and hand de-duping). + +sub load_title_key_map { + my ($filename) = @_; + + open FH, "<$filename" or die "$filename: $!"; + + my $map = {}; + while () { + /^(\S+) (\d+)/ or next; + $map->{$1} = $2; + } + + close FH; + + return $map; +} + +############################# MAIN ############################### + +my $opts = { + "i" => "-", # input file + "p" => "dump", # operation + "t" => undef +}; +my $operations = { + "dump" => sub { # Just parse input file (serctl.data) and dump resulting + # data structure + my ($opts) = @_; + print Dumper(load_serctl_export($opts->{i})), "\n"; + + return 0; + }, + "keys" => sub { # Over the whole list of records, how often do all keys + # appear? This is an analysis tool. + my ($opts) = @_; + my $freq = unique_keys(load_serctl_export($opts->{i})); + + my @keys = reverse sort { $freq->{$a} <=> $freq->{$b} } keys %$freq; + + foreach (@keys) { + printf ("%-19s %d\n", $_, $freq->{$_}); + } + return 0; + }, + "map" => sub { # Combine records from input file with title key map and + # perform transformations suitable for evergreen import. + my ($opts) = @_; + + my $title_key_map = load_title_key_map($opts->{t}); + my $data = load_serctl_export($opts->{i}); + + foreach my $record (@$data) { + my $title_key = $record->{SERC_TITLE_KEY}; + if (!($record->{bre} = $title_key_map->{$title_key})) { + $record->{unready} = 1; + next; + } + + $record->{number_of_streams} = $record->{SERC_REC_COP}; + $record->{subscription_owning_lib} = $record->{SERC_LIB}; + $record->{distribtion_holding_lib} = substr($record->{HOLDING_CODE}, 0, 1); + + $record->{scap_active} = + $record->{SERC_STATUS} eq 'ACTIVE' ? 't' : 'f'; + $record->{scap_pattern_code} = record_bits_to_scap($record) if $record->{PUBCYCLE_DEF}; + + # save uppercase keys' values in a note + my @uc_keys = grep { $_ !~ /[a-z]/ } (keys %$record); + $record->{note_text} = join( + "\n", + map { "$_: $record->{$_}" } @uc_keys + ); + + # and remove uppercase keys now + delete $record->{$_} for @uc_keys; + } + + if ($opts->{u}) { + my $unready = [grep { $_->{unready} } @$data]; + if ($unready) { + open FH, ">$opts->{u}" or die "$opts->{u}: $!"; + print FH Dumper($unready), "\n"; + close FH; + } + } + + print Dumper($data), "\n"; + + return 0; + } +}; + +getopts("i:p:t:u:", $opts) or die("usage: $0\n\t[-i infile]\n\t[-p operation]\n\t[-t title_key_map]\n\t[-u dump_file_for_unmapped_records]"); + + +my $operation = lc $opts->{p}; +exit $operations->{$operation}->($opts) if exists $operations->{$operation}; +die ( + "specify a valid operation\n$0 -p [" . + join(" | ", (keys %$operations)) . "]" +); -- 1.7.2.5