Makefile.PL
README
lib/Equinox/Migration.pm
-lib/Equinox/Migration/MapDrivenXMLProc.pm
+lib/Equinox/Migration/MapDrivenMARCXMLProc.pm
+lib/Equinox/Migration/MARCXMLSampler.pm
lib/Equinox/Migration/SubfieldMapper.pm
lib/Equinox/Migration/SubfieldMapper.pm
t/00-load.t
-t/pod-coverage.t
-t/pod.t
--- /dev/null
+package Equinox::Migration::MARCXMLSampler;
+
+use warnings;
+use strict;
+
+use XML::Twig;
+use Equinox::Migration::SimpleTagList 1.001;
+
+# FIXME
+#
+# sample functionality should be extracted into a new module which
+# uses E::M::SM to drive sampling of individual datafields, and
+# reports ALL datafields which occur
+#
+# --sample should give the list of all datafields
+# --samplefile should take a SM map as teh argument and introspect the mapped datafields
+
+
+=head1 NAME
+
+Equinox::Migration::MARCXMLSampler
+
+=head1 VERSION
+
+Version 1.000
+
+=cut
+
+our $VERSION = '1.000';
+
+
+=head1 SYNOPSIS
+
+Foo
+
+ use Equinox::Migration::MARCXMLSampler;
+
+
+=head1 METHODS
+
+
+=head2 new
+
+=cut
+
+sub new {
+ my ($class, %args) = @_;
+
+ my $self = bless { data => { recs => undef, # X::T record objects
+ rcnt => 0, # next record counter
+ samp => {}, # data samples
+ tags => {}, # all found tags
+ },
+ }, $class;
+
+ # initialize twig
+ die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
+ if (-r $args{marcfile}) {
+ $self->{twig} = XML::Twig->new;
+ $self->{twig}->parsefile($args{marcfile});
+ my @records = $self->{twig}->root->children;
+ $self->{data}{recs} = \@records;
+ } else {
+ die "Can't open marc file: $!\n";
+ }
+
+ # if we have a sample arg, create the sample map
+ $self->{map} = Equinox::Migration::SimpleTagList->new(file => $args{mapfile})
+ if ($args{mapfile});
+ $self->{map} = Equinox::Migration::SimpleTagList->new(str => $args{mapstring})
+ if ($args{mapstring});
+
+ return $self;
+}
+
+
+=head2 parse_records
+
+Extracts data from MARC records, per the mapping file.
+
+=cut
+
+sub parse_records {
+ my ($self) = @_;
+
+ for my $record ( @{$self->{data}{recs}} ) {
+ my @fields = $record->children;
+ for my $f (@fields)
+ { $self->process_field($f) }
+
+ # cleanup memory and increment pointer
+ $record->purge;
+ $self->{data}{rcnt}++;
+ }
+}
+
+sub process_field {
+ my ($self, $field) = @_;
+ my $map = $self->{map};
+ my $tag = $field->{'att'}->{'tag'};
+ return unless ($tag and $tag > 9);
+
+ # increment raw tag count
+ $self->{data}{tags}{$tag}++;
+
+ if ($map and $map->has($tag)) {
+ my @subs = $field->children('subfield');
+ for my $sub (@subs)
+ { $self->process_subs($tag, $sub) }
+ }
+}
+
+sub process_subs {
+ my ($self, $tag, $sub) = @_;
+ my $map = $self->{map};
+ my $code = $sub->{'att'}->{'code'};
+
+ # handle unmapped tag/subs
+ my $samp = $self->{data}{samp};
+ # set a value, total-seen count and records-seen-in count
+ $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code};
+ $samp->{$tag}{$code}{count}++;
+ $samp->{$tag}{$code}{rcnt}++ unless ( defined $samp->{$tag}{$code}{last} and
+ $samp->{$tag}{$code}{last} == $self->{data}{rcnt} );
+ $samp->{$tag}{$code}{last} = $self->{data}{rcnt};
+}
+
+
+=head1 SAMPLED TAGS
+
+If the C<sample> argument is passed to L</new>, there will also be a
+structure which holds data about unmapped subfields encountered in
+mapped tags which are also in the declared sample set. This
+information is collected over the life of the object and is not reset
+for every record processed (as the current record data neccessarily
+is).
+
+ { tag_id => {
+ sub_code => { value => VALUE,
+ count => COUNT,
+ rcnt => RCOUNT
+ },
+ ...
+ },
+ ...
+ }
+
+For each mapped tag, for each unmapped subfield, there is a hash of
+data about that subfield containing
+
+ * value - A sample of the subfield text
+ * count - Total number of times the subfield was seen
+ * rcnt - The number of records the subfield was seen in
+
+=head1 AUTHOR
+
+Shawn Boyette, C<< <sboyette at esilibrary.com> >>
+
+=head1 BUGS
+
+Please report any bugs or feature requests to the above email address.
+
+=head1 SUPPORT
+
+You can find documentation for this module with the perldoc command.
+
+ perldoc Equinox::Migration::MARCXMLSampler
+
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2009 Equinox, all rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+
+=cut
+
+1; # End of Equinox::Migration::MARCXMLSampler
my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
marcfile => FILE );
-There is an optional third, argument, C<sample>, which specifies a
-arrayref of datafields to "sample" by reporting on subfields which are
-found in the data but not in the map.
-
- my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
- marcfile => FILE,
- sample => \@TAGS
- );
-
-See L</UNMAPPED TAGS> for more info.
-
=cut
sub new {
data => { recs => undef, # X::T record objects
rptr => 0, # next record pointer
crec => undef, # parsed record storage
- stag => undef, # list of tags to sample
- umap => undef, # unmapped data samples
},
}, $class;
die "Can't open marc file: $!\n";
}
- # if we have a sample arg, set up the sample set and umap hash
- if (defined $args{sample}) {
- for my $s ( @{$args{sample}})
- { $self->{data}{stag}{$s} = 1 }
- $self->{data}{umap} = {};
- }
-
return $self;
}
my $code = $sub->{'att'}->{'code'};
# handle unmapped tag/subs
- unless ($map->has($tag, $code)) {
- my $u = $self->{data}{umap};
- my $s = $self->{data}{stag};
- return unless (defined $s->{$tag});
-
- # set a value, total-seen count and records-seen-in count
- $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
- $u->{$tag}{$code}{count}++;
- $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
- $u->{$tag}{$code}{last} == $self->{data}{rptr} );
- $u->{$tag}{$code}{last} = $self->{data}{rptr};
- return;
- }
+ return unless ($map->has($tag, $code));
# fetch our datafield struct and fieldname
my $dataf = $self->{data}{crec}{tags}[-1];
}
}
- die "Multiple occurances of a non-multi field: $tag$code at rec ",($self->{data}{rptr} + 1),"\n"
- if (defined $dataf->{uni}{$code});
+ die "Multiple occurances of a non-multi field: $tag$code at rec ",
+ ($self->{data}{rptr} + 1),"\n" if (defined $dataf->{uni}{$code});
$dataf->{uni}{$code} = $sub->text;
}
record struct. Oppose subfields which are not mapped, which will be
C<undef>.
-=head1 UNMAPPED TAGS
-
-If the C<sample> argument is passed to L</new>, there will also be a
-structure which holds data about unmapped subfields encountered in
-mapped tags which are also in the declared sample set. This
-information is collected over the life of the object and is not reset
-for every record processed (as the current record data neccessarily
-is).
-
- { tag_id => {
- sub_code => { value => VALUE,
- count => COUNT,
- rcnt => RCOUNT
- },
- ...
- },
- ...
- }
-
-For each mapped tag, for each unmapped subfield, there is a hash of
-data about that subfield containing
-
- * value - A sample of the subfield text
- * count - Total number of times the subfield was seen
- * rcnt - The number of records the subfield was seen in
=head1 AUTHOR
} else {
die "Can't open tags file: $!\n";
}
- } elsif ($args{str}) {
+ }elsif ($args{str}) {
$self->generate($args{str},'scalar');
}
my ($self, $file, $scalar) = @_;
if ($scalar) {
- open TAGFILE, '<:scalar', $file;
+ open TAGFILE, '<', \$file;
} else {
open TAGFILE, '<', $file;
}
# file with bad token
$. = 0;
$stl = Equinox::Migration::SimpleTagList->new;
-$stl->{conf}{file} = "./t/corpus/stl-2.txt";
-eval {$stl->generate};
+eval {$stl->generate("./t/corpus/stl-2.txt")};
is ($@, "Unknown chunk fnord in tags file (line 1)\n");
# file with except in wrong place
$. = 0;
$stl = Equinox::Migration::SimpleTagList->new;
-$stl->{conf}{file} = "./t/corpus/stl-3.txt";
-eval {$stl->generate};
+eval {$stl->generate("./t/corpus/stl-3.txt")};
is ($@, "Keyword 'except' can only follow a range (line 1)\n");
--- /dev/null
+#!perl -T
+
+#use Test::More tests => 39;
+use Test::More qw(no_plan);
+use Equinox::Migration::MARCXMLSampler;
+
+# fails
+eval { my $mp =
+ Equinox::Migration::MARCXMLSampler->new(tagfile => 't/corpus/mdmpmap-00.txt') };
+is ($@, "Argument 'marcfile' must be specified\n", 'no marcfile');
+
+
+# baseline object creation
+my $mp = Equinox::Migration::MARCXMLSampler->new( marcfile => 't/corpus/mdmp-0.txt');
+is(ref $mp, "Equinox::Migration::MARCXMLSampler", "self is self");
+
+# simple, original sample tests inherited from MDMP
+$mp = Equinox::Migration::MARCXMLSampler->new( marcfile => 't/corpus/mdmp-0.txt',
+ mapstring => '999',
+ );
+$mp->parse_records;
+my $sample = $mp->{data}{samp};
+is (defined $sample->{999}, 1);
+is (defined $sample->{999}{x}, 1);
+is ($sample->{999}{x}{value}, 'MYSTERY', 'Should be the first seen value');
+is ($sample->{999}{x}{count}, 7, 'One real in each record, plus 3 synthetic in last rec');
+is ($sample->{999}{x}{rcnt}, 4, 'Occurs in all records');
+is ($sample->{999}{s}{rcnt}, 3, 'Was removed from one record');
+
+my $tags = $mp->{data}{tags};
+is ($tags->{961}, 4);
+is ($tags->{250}, 1);