X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=Equinox-Migration%2Flib%2FEquinox%2FMigration%2FMARCXMLSampler.pm;h=60f5ec95b5d5c65f226ad3fdb462e557426efb28;hp=e4243e694a1458d3df704eb96c3cad9a87b46411;hb=7af21df77fb31c255c21c4a37addf7c645d42731;hpb=5cb143f1fc584a55e4732dbc747c875367baa241 diff --git a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm index e4243e6..60f5ec9 100644 --- a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm +++ b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm @@ -1,5 +1,21 @@ package Equinox::Migration::MARCXMLSampler; +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + use warnings; use strict; @@ -13,11 +29,14 @@ Equinox::Migration::MARCXMLSampler =head1 VERSION -Version 1.000 +Version 1.003 =cut -our $VERSION = '1.000'; +our $VERSION = '1.003'; + +my $taglist; +my $dstore; =head1 SYNOPSIS @@ -60,87 +79,91 @@ when you only want deep introspection on a handful of tags). sub new { my ($class, %args) = @_; - my $self = bless { data => { recs => undef, # X::T record objects - rcnt => 0, # record counter - tcnt => 0, # tag counter - samp => {}, # data samples - tags => {}, # all found tags - }, - }, $class; + $dstore = { rcnt => 0, # record counter + tcnt => 0, # tag counter + scnt => {}, # subfield/tag counters + samp => {}, # data samples + tags => {}, # all found tags + }; - # initialize twig - die "Argument 'marcfile' must be specified\n" unless ($args{marcfile}); - if (-r $args{marcfile}) { - $self->{twig} = XML::Twig->new; - $self->{conf}{marc} = $args{marcfile}; - } else { - die "Can't open marc file: $!\n"; - } + my $self = bless { data => $dstore, + }, $class; # if we have a sample arg, create the sample map die "Can't use a mapfile and mapstring\n" if ($args{mapfile} and $args{mapstring}); - $self->{map} = Equinox::Migration::SimpleTagList->new(file => $args{mapfile}) + $taglist = Equinox::Migration::SimpleTagList->new(file => $args{mapfile}) if ($args{mapfile}); - $self->{map} = Equinox::Migration::SimpleTagList->new(str => $args{mapstring}) + $taglist = Equinox::Migration::SimpleTagList->new(str => $args{mapstring}) if ($args{mapstring}); + # initialize twig and process xml + die "Argument 'marcfile' must be specified\n" unless ($args{marcfile}); + if (-r $args{marcfile}) { + my $xmltwig = XML::Twig->new( twig_handlers => { record => \&parse_record } ); + $xmltwig->parsefile( $args{marcfile} ); + } else { + die "Can't open marc file: $!\n"; + } + + # hand ourselves back for datastore manipulation return $self; } -=head2 parse_records +=head2 parse_record -Extracts data from MARC records, per the mapping file. +XML::Twig handler for record elements; drives data extraction process. =cut -sub parse_records { - my ($self) = @_; +sub parse_record { + my ($twig, $record) = @_; - $self->{twig}->parsefile( $self->{conf}{marc} ); - for my $record ( $self->{twig}->root->children ) { - my @fields = $record->children; - for my $f (@fields) - { $self->process_field($f); $f->purge } + my @fields = $record->children; + for my $f (@fields) + { process_field($f) } - # cleanup memory and increment pointer - $record->purge; - $self->{data}{rcnt}++; - } + # cleanup memory and increment pointer + $record->purge; + $dstore->{rcnt}++; } + sub process_field { - my ($self, $field) = @_; - my $map = $self->{map}; + my ($field) = @_; my $tag = $field->{'att'}->{'tag'}; - return unless ($tag and $tag > 9); + return unless ($tag and ($tag =~ /[^0-9]/ or $tag > 9)); # increment raw tag count - $self->{data}{tcnt}++; - $self->{data}{tags}{$tag}++; + $dstore->{tcnt}++; + $dstore->{tags}{$tag}++; - if ($map and $map->has($tag)) { + + if ($taglist and $taglist->has($tag)) { my @subs = $field->children('subfield'); + my $i= 0; for my $sub (@subs) - { $self->process_subs($tag, $sub); $sub->purge } + { process_subs($tag, $sub); $i++ } + + # increment sub length counter + $dstore->{scnt}{$tag}{$i}++; } } sub process_subs { - my ($self, $tag, $sub) = @_; - my $map = $self->{map}; + my ($tag, $sub) = @_; my $code = $sub->{'att'}->{'code'}; # handle unmapped tag/subs - my $samp = $self->{data}{samp}; + my $samp = $dstore->{samp}; # set a value, total-seen count and records-seen-in count - $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code}; + $samp->{$tag}{$code}{value} = $sub->text unless ($samp->{$tag}{$code}{value} and + $samp->{$tag}{$code}{value} =~ /\w/); $samp->{$tag}{$code}{count}++; $samp->{$tag}{$code}{tcnt}++ unless ( defined $samp->{$tag}{$code}{last} and - $samp->{$tag}{$code}{last} == $self->{data}{tcnt} ); - $samp->{$tag}{$code}{last} = $self->{data}{tcnt}; - #FIXME tcnt not rcnt + $samp->{$tag}{$code}{last} == $dstore->{tcnt} ); + $samp->{$tag}{$code}{last} = $dstore->{tcnt}; }