From c4f56550499fc26ed1f5f5aeabc972ed8f9b27ec Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Tue, 28 Apr 2009 02:31:51 +0000 Subject: [PATCH] birthing MARCXMLSampler --- Equinox-Migration/MANIFEST | 5 +- .../lib/Equinox/Migration/MARCXMLSampler.pm | 180 ++++++++++++++++++++ .../lib/Equinox/Migration/MapDrivenMARCXMLProc.pm | 63 +------- .../lib/Equinox/Migration/SimpleTagList.pm | 4 +- Equinox-Migration/t/01-SimpleTagList.t | 6 +- Equinox-Migration/t/04-MARCXMLSampler.t | 32 ++++ 6 files changed, 221 insertions(+), 69 deletions(-) create mode 100644 Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm create mode 100644 Equinox-Migration/t/04-MARCXMLSampler.t diff --git a/Equinox-Migration/MANIFEST b/Equinox-Migration/MANIFEST index 7d1dc1c..b5b7c15 100644 --- a/Equinox-Migration/MANIFEST +++ b/Equinox-Migration/MANIFEST @@ -3,9 +3,8 @@ MANIFEST Makefile.PL README lib/Equinox/Migration.pm -lib/Equinox/Migration/MapDrivenXMLProc.pm +lib/Equinox/Migration/MapDrivenMARCXMLProc.pm +lib/Equinox/Migration/MARCXMLSampler.pm lib/Equinox/Migration/SubfieldMapper.pm lib/Equinox/Migration/SubfieldMapper.pm t/00-load.t -t/pod-coverage.t -t/pod.t diff --git a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm new file mode 100644 index 0000000..e0cc670 --- /dev/null +++ b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm @@ -0,0 +1,180 @@ +package Equinox::Migration::MARCXMLSampler; + +use warnings; +use strict; + +use XML::Twig; +use Equinox::Migration::SimpleTagList 1.001; + +# FIXME +# +# sample functionality should be extracted into a new module which +# uses E::M::SM to drive sampling of individual datafields, and +# reports ALL datafields which occur +# +# --sample should give the list of all datafields +# --samplefile should take a SM map as teh argument and introspect the mapped datafields + + +=head1 NAME + +Equinox::Migration::MARCXMLSampler + +=head1 VERSION + +Version 1.000 + +=cut + +our $VERSION = '1.000'; + + +=head1 SYNOPSIS + +Foo + + use Equinox::Migration::MARCXMLSampler; + + +=head1 METHODS + + +=head2 new + +=cut + +sub new { + my ($class, %args) = @_; + + my $self = bless { data => { recs => undef, # X::T record objects + rcnt => 0, # next record counter + samp => {}, # data samples + tags => {}, # all found tags + }, + }, $class; + + # initialize twig + die "Argument 'marcfile' must be specified\n" unless ($args{marcfile}); + if (-r $args{marcfile}) { + $self->{twig} = XML::Twig->new; + $self->{twig}->parsefile($args{marcfile}); + my @records = $self->{twig}->root->children; + $self->{data}{recs} = \@records; + } else { + die "Can't open marc file: $!\n"; + } + + # if we have a sample arg, create the sample map + $self->{map} = Equinox::Migration::SimpleTagList->new(file => $args{mapfile}) + if ($args{mapfile}); + $self->{map} = Equinox::Migration::SimpleTagList->new(str => $args{mapstring}) + if ($args{mapstring}); + + return $self; +} + + +=head2 parse_records + +Extracts data from MARC records, per the mapping file. + +=cut + +sub parse_records { + my ($self) = @_; + + for my $record ( @{$self->{data}{recs}} ) { + my @fields = $record->children; + for my $f (@fields) + { $self->process_field($f) } + + # cleanup memory and increment pointer + $record->purge; + $self->{data}{rcnt}++; + } +} + +sub process_field { + my ($self, $field) = @_; + my $map = $self->{map}; + my $tag = $field->{'att'}->{'tag'}; + return unless ($tag and $tag > 9); + + # increment raw tag count + $self->{data}{tags}{$tag}++; + + if ($map and $map->has($tag)) { + my @subs = $field->children('subfield'); + for my $sub (@subs) + { $self->process_subs($tag, $sub) } + } +} + +sub process_subs { + my ($self, $tag, $sub) = @_; + my $map = $self->{map}; + my $code = $sub->{'att'}->{'code'}; + + # handle unmapped tag/subs + my $samp = $self->{data}{samp}; + # set a value, total-seen count and records-seen-in count + $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code}; + $samp->{$tag}{$code}{count}++; + $samp->{$tag}{$code}{rcnt}++ unless ( defined $samp->{$tag}{$code}{last} and + $samp->{$tag}{$code}{last} == $self->{data}{rcnt} ); + $samp->{$tag}{$code}{last} = $self->{data}{rcnt}; +} + + +=head1 SAMPLED TAGS + +If the C argument is passed to L, there will also be a +structure which holds data about unmapped subfields encountered in +mapped tags which are also in the declared sample set. This +information is collected over the life of the object and is not reset +for every record processed (as the current record data neccessarily +is). + + { tag_id => { + sub_code => { value => VALUE, + count => COUNT, + rcnt => RCOUNT + }, + ... + }, + ... + } + +For each mapped tag, for each unmapped subfield, there is a hash of +data about that subfield containing + + * value - A sample of the subfield text + * count - Total number of times the subfield was seen + * rcnt - The number of records the subfield was seen in + +=head1 AUTHOR + +Shawn Boyette, C<< >> + +=head1 BUGS + +Please report any bugs or feature requests to the above email address. + +=head1 SUPPORT + +You can find documentation for this module with the perldoc command. + + perldoc Equinox::Migration::MARCXMLSampler + + +=head1 COPYRIGHT & LICENSE + +Copyright 2009 Equinox, all rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + + +=cut + +1; # End of Equinox::Migration::MARCXMLSampler diff --git a/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm b/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm index 236036e..904ac25 100644 --- a/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm +++ b/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm @@ -48,17 +48,6 @@ and C (the MARC data to be processed). my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE, marcfile => FILE ); -There is an optional third, argument, C, which specifies a -arrayref of datafields to "sample" by reporting on subfields which are -found in the data but not in the map. - - my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE, - marcfile => FILE, - sample => \@TAGS - ); - -See L for more info. - =cut sub new { @@ -71,8 +60,6 @@ sub new { data => { recs => undef, # X::T record objects rptr => 0, # next record pointer crec => undef, # parsed record storage - stag => undef, # list of tags to sample - umap => undef, # unmapped data samples }, }, $class; @@ -94,13 +81,6 @@ sub new { die "Can't open marc file: $!\n"; } - # if we have a sample arg, set up the sample set and umap hash - if (defined $args{sample}) { - for my $s ( @{$args{sample}}) - { $self->{data}{stag}{$s} = 1 } - $self->{data}{umap} = {}; - } - return $self; } @@ -178,19 +158,7 @@ sub process_subs { my $code = $sub->{'att'}->{'code'}; # handle unmapped tag/subs - unless ($map->has($tag, $code)) { - my $u = $self->{data}{umap}; - my $s = $self->{data}{stag}; - return unless (defined $s->{$tag}); - - # set a value, total-seen count and records-seen-in count - $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code}; - $u->{$tag}{$code}{count}++; - $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and - $u->{$tag}{$code}{last} == $self->{data}{rptr} ); - $u->{$tag}{$code}{last} = $self->{data}{rptr}; - return; - } + return unless ($map->has($tag, $code)); # fetch our datafield struct and fieldname my $dataf = $self->{data}{crec}{tags}[-1]; @@ -205,8 +173,8 @@ sub process_subs { } } - die "Multiple occurances of a non-multi field: $tag$code at rec ",($self->{data}{rptr} + 1),"\n" - if (defined $dataf->{uni}{$code}); + die "Multiple occurances of a non-multi field: $tag$code at rec ", + ($self->{data}{rptr} + 1),"\n" if (defined $dataf->{uni}{$code}); $dataf->{uni}{$code} = $sub->text; } @@ -332,31 +300,6 @@ datafield will be given a value of '' (the null string) in the current record struct. Oppose subfields which are not mapped, which will be C. -=head1 UNMAPPED TAGS - -If the C argument is passed to L, there will also be a -structure which holds data about unmapped subfields encountered in -mapped tags which are also in the declared sample set. This -information is collected over the life of the object and is not reset -for every record processed (as the current record data neccessarily -is). - - { tag_id => { - sub_code => { value => VALUE, - count => COUNT, - rcnt => RCOUNT - }, - ... - }, - ... - } - -For each mapped tag, for each unmapped subfield, there is a hash of -data about that subfield containing - - * value - A sample of the subfield text - * count - Total number of times the subfield was seen - * rcnt - The number of records the subfield was seen in =head1 AUTHOR diff --git a/Equinox-Migration/lib/Equinox/Migration/SimpleTagList.pm b/Equinox-Migration/lib/Equinox/Migration/SimpleTagList.pm index 596ec9b..0b82c0a 100644 --- a/Equinox-Migration/lib/Equinox/Migration/SimpleTagList.pm +++ b/Equinox-Migration/lib/Equinox/Migration/SimpleTagList.pm @@ -62,7 +62,7 @@ sub new { } else { die "Can't open tags file: $!\n"; } - } elsif ($args{str}) { + }elsif ($args{str}) { $self->generate($args{str},'scalar'); } @@ -109,7 +109,7 @@ sub generate { my ($self, $file, $scalar) = @_; if ($scalar) { - open TAGFILE, '<:scalar', $file; + open TAGFILE, '<', \$file; } else { open TAGFILE, '<', $file; } diff --git a/Equinox-Migration/t/01-SimpleTagList.t b/Equinox-Migration/t/01-SimpleTagList.t index 69a1994..8ef29c0 100644 --- a/Equinox-Migration/t/01-SimpleTagList.t +++ b/Equinox-Migration/t/01-SimpleTagList.t @@ -90,13 +90,11 @@ is ($stl->has(304), 0, 'exception'); # file with bad token $. = 0; $stl = Equinox::Migration::SimpleTagList->new; -$stl->{conf}{file} = "./t/corpus/stl-2.txt"; -eval {$stl->generate}; +eval {$stl->generate("./t/corpus/stl-2.txt")}; is ($@, "Unknown chunk fnord in tags file (line 1)\n"); # file with except in wrong place $. = 0; $stl = Equinox::Migration::SimpleTagList->new; -$stl->{conf}{file} = "./t/corpus/stl-3.txt"; -eval {$stl->generate}; +eval {$stl->generate("./t/corpus/stl-3.txt")}; is ($@, "Keyword 'except' can only follow a range (line 1)\n"); diff --git a/Equinox-Migration/t/04-MARCXMLSampler.t b/Equinox-Migration/t/04-MARCXMLSampler.t new file mode 100644 index 0000000..d1251aa --- /dev/null +++ b/Equinox-Migration/t/04-MARCXMLSampler.t @@ -0,0 +1,32 @@ +#!perl -T + +#use Test::More tests => 39; +use Test::More qw(no_plan); +use Equinox::Migration::MARCXMLSampler; + +# fails +eval { my $mp = + Equinox::Migration::MARCXMLSampler->new(tagfile => 't/corpus/mdmpmap-00.txt') }; +is ($@, "Argument 'marcfile' must be specified\n", 'no marcfile'); + + +# baseline object creation +my $mp = Equinox::Migration::MARCXMLSampler->new( marcfile => 't/corpus/mdmp-0.txt'); +is(ref $mp, "Equinox::Migration::MARCXMLSampler", "self is self"); + +# simple, original sample tests inherited from MDMP +$mp = Equinox::Migration::MARCXMLSampler->new( marcfile => 't/corpus/mdmp-0.txt', + mapstring => '999', + ); +$mp->parse_records; +my $sample = $mp->{data}{samp}; +is (defined $sample->{999}, 1); +is (defined $sample->{999}{x}, 1); +is ($sample->{999}{x}{value}, 'MYSTERY', 'Should be the first seen value'); +is ($sample->{999}{x}{count}, 7, 'One real in each record, plus 3 synthetic in last rec'); +is ($sample->{999}{x}{rcnt}, 4, 'Occurs in all records'); +is ($sample->{999}{s}{rcnt}, 3, 'Was removed from one record'); + +my $tags = $mp->{data}{tags}; +is ($tags->{961}, 4); +is ($tags->{250}, 1); -- 1.7.2.5