package Equinox::Migration::MARCXMLSampler;
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
use warnings;
use strict;
use XML::Twig;
use Equinox::Migration::SimpleTagList 1.001;
-# FIXME
-#
-# sample functionality should be extracted into a new module which
-# uses E::M::SM to drive sampling of individual datafields, and
-# reports ALL datafields which occur
-#
-# --sample should give the list of all datafields
-# --samplefile should take a SM map as teh argument and introspect the mapped datafields
-
=head1 NAME
=head1 VERSION
-Version 1.000
+Version 1.003
=cut
-our $VERSION = '1.000';
+our $VERSION = '1.003';
+
+my $taglist;
+my $dstore;
=head1 SYNOPSIS
-Foo
+Produce a list of all fields in a MARCXML file which have a C<tag>
+attribute, and count how many times each occurs
+
+ my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml" );
+ $s->parse_records;
+
+Also deeply introspect certain tags, producing lists of all subfields,
+and counts of how many times each subfield occurs I<in toto> and how
+many records each subfield appears in
- use Equinox::Migration::MARCXMLSampler;
+ my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml",
+ mapfile => "foo.map" );
+ ~ or ~
+
+ my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml",
+ mapstring => "852 999" );
+ $s->parse_records;
=head1 METHODS
=head2 new
+Takes one required argument, C<marcfile>, which points to the MARCXML
+file to be processed.
+
+Has two mutually-exclusive optional arguments, C<mapfile> and
+C<mapstring>". The former should point to a file which will be used as
+a L<Equinox::Migration::SimpleTagList> map; the latter should have as
+its value a text string which will be used in the same way (handy for
+when you only want deep introspection on a handful of tags).
+
=cut
sub new {
my ($class, %args) = @_;
- my $self = bless { data => { recs => undef, # X::T record objects
- rcnt => 0, # next record counter
- samp => {}, # data samples
- tags => {}, # all found tags
- },
+ $dstore = { rcnt => 0, # record counter
+ tcnt => 0, # tag counter
+ scnt => {}, # subfield/tag counters
+ samp => {}, # data samples
+ tags => {}, # all found tags
+ };
+
+ my $self = bless { data => $dstore,
}, $class;
- # initialize twig
+ # if we have a sample arg, create the sample map
+ die "Can't use a mapfile and mapstring\n"
+ if ($args{mapfile} and $args{mapstring});
+ $taglist = Equinox::Migration::SimpleTagList->new(file => $args{mapfile})
+ if ($args{mapfile});
+ $taglist = Equinox::Migration::SimpleTagList->new(str => $args{mapstring})
+ if ($args{mapstring});
+
+ # initialize twig and process xml
die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
if (-r $args{marcfile}) {
- $self->{twig} = XML::Twig->new;
- $self->{twig}->parsefile($args{marcfile});
- my @records = $self->{twig}->root->children;
- $self->{data}{recs} = \@records;
+ my $xmltwig = XML::Twig->new( twig_handlers => { record => \&parse_record } );
+ $xmltwig->parsefile( $args{marcfile} );
} else {
die "Can't open marc file: $!\n";
}
- # if we have a sample arg, create the sample map
- $self->{map} = Equinox::Migration::SimpleTagList->new(file => $args{mapfile})
- if ($args{mapfile});
- $self->{map} = Equinox::Migration::SimpleTagList->new(str => $args{mapstring})
- if ($args{mapstring});
-
+ # hand ourselves back for datastore manipulation
return $self;
}
-=head2 parse_records
+=head2 parse_record
-Extracts data from MARC records, per the mapping file.
+XML::Twig handler for record elements; drives data extraction process.
=cut
-sub parse_records {
- my ($self) = @_;
+sub parse_record {
+ my ($twig, $record) = @_;
- for my $record ( @{$self->{data}{recs}} ) {
- my @fields = $record->children;
- for my $f (@fields)
- { $self->process_field($f) }
+ my @fields = $record->children;
+ for my $f (@fields)
+ { process_field($f) }
- # cleanup memory and increment pointer
- $record->purge;
- $self->{data}{rcnt}++;
- }
+ # cleanup memory and increment pointer
+ $record->purge;
+ $dstore->{rcnt}++;
}
+
sub process_field {
- my ($self, $field) = @_;
- my $map = $self->{map};
+ my ($field) = @_;
my $tag = $field->{'att'}->{'tag'};
- return unless ($tag and $tag > 9);
+ return unless ($tag and ($tag =~ /[^0-9]/ or $tag > 9));
# increment raw tag count
- $self->{data}{tags}{$tag}++;
+ $dstore->{tcnt}++;
+ $dstore->{tags}{$tag}++;
+
- if ($map and $map->has($tag)) {
+ if ($taglist and $taglist->has($tag)) {
my @subs = $field->children('subfield');
+ my $i= 0;
for my $sub (@subs)
- { $self->process_subs($tag, $sub) }
+ { process_subs($tag, $sub); $i++ }
+
+ # increment sub length counter
+ $dstore->{scnt}{$tag}{$i}++;
}
}
sub process_subs {
- my ($self, $tag, $sub) = @_;
- my $map = $self->{map};
+ my ($tag, $sub) = @_;
my $code = $sub->{'att'}->{'code'};
# handle unmapped tag/subs
- my $samp = $self->{data}{samp};
+ my $samp = $dstore->{samp};
# set a value, total-seen count and records-seen-in count
- $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code};
+ $samp->{$tag}{$code}{value} = $sub->text unless ($samp->{$tag}{$code}{value} and
+ $samp->{$tag}{$code}{value} =~ /\w/);
$samp->{$tag}{$code}{count}++;
- $samp->{$tag}{$code}{rcnt}++ unless ( defined $samp->{$tag}{$code}{last} and
- $samp->{$tag}{$code}{last} == $self->{data}{rcnt} );
- $samp->{$tag}{$code}{last} = $self->{data}{rcnt};
+ $samp->{$tag}{$code}{tcnt}++ unless ( defined $samp->{$tag}{$code}{last} and
+ $samp->{$tag}{$code}{last} == $dstore->{tcnt} );
+ $samp->{$tag}{$code}{last} = $dstore->{tcnt};
}
=head1 SAMPLED TAGS
-If the C<sample> argument is passed to L</new>, there will also be a
-structure which holds data about unmapped subfields encountered in
-mapped tags which are also in the declared sample set. This
-information is collected over the life of the object and is not reset
-for every record processed (as the current record data neccessarily
-is).
+If the C<mapfile> or C<mapstring> arguments are passed to L</new>, a
+structure will be constructed which holds data about tags in the map.
{ tag_id => {
sub_code => { value => VALUE,
count => COUNT,
- rcnt => RCOUNT
+ tcnt => TAGCOUNT
},
...
},
...
}
-For each mapped tag, for each unmapped subfield, there is a hash of
-data about that subfield containing
+For each subfield in each mapped tag, there is a hash of data about
+that subfield containing
* value - A sample of the subfield text
* count - Total number of times the subfield was seen
- * rcnt - The number of records the subfield was seen in
+ * tcnt - The number of tags the subfield was seen in
=head1 AUTHOR