X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=Equinox-Migration%2Flib%2FEquinox%2FMigration%2FMARCXMLSampler.pm;h=1cd8b5c0dddc34b6c0818a6e5b58d2c1f9ae2f51;hp=ef89c6afa2040d7feb951c7aa1ea460cd1577917;hb=1c8c72dc5c4f99e6eacd61c4ddbf96e835a79cf1;hpb=8e63d0bddf6634ebf2a8d5b8e160c285c6676d0e diff --git a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm index ef89c6a..1cd8b5c 100644 --- a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm +++ b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm @@ -13,18 +13,32 @@ Equinox::Migration::MARCXMLSampler =head1 VERSION -Version 1.000 +Version 1.002 =cut -our $VERSION = '1.000'; +our $VERSION = '1.002'; =head1 SYNOPSIS -Foo +Produce a list of all fields in a MARCXML file which have a C<tag> +attribute, and count how many times each occurs - use Equinox::Migration::MARCXMLSampler; + my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml" ); + $s->parse_records; + +Also deeply introspect certain tags, producing lists of all subfields, +and counts of how many times each subfield occurs I<in total> and how +many records each subfield appears in + + my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml", + mapfile => "foo.map" ); + ~ or ~ + + my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml", + mapstring => "852 999" ); + $s->parse_records; =head1 METHODS @@ -32,13 +46,24 @@ Foo =head2 new +Takes one required argument, C<marcfile>, which points to the MARCXML +file to be processed. + +Has two mutually-exclusive optional arguments, C<mapfile> and +C<mapstring>. The former should point to a file which will be used as +a L<Equinox::Migration::SimpleTagList> map; the latter should have as +its value a text string which will be used in the same way (handy for +when you only want deep introspection on a handful of tags). 
+ =cut sub new { my ($class, %args) = @_; my $self = bless { data => { recs => undef, # X::T record objects - rcnt => 0, # next record counter + rcnt => 0, # record counter + tcnt => 0, # tag counter + scnt => {}, # subfield/tag counters samp => {}, # data samples tags => {}, # all found tags }, @@ -48,14 +73,14 @@ sub new { die "Argument 'marcfile' must be specified\n" unless ($args{marcfile}); if (-r $args{marcfile}) { $self->{twig} = XML::Twig->new; - $self->{twig}->parsefile($args{marcfile}); - my @records = $self->{twig}->root->children; - $self->{data}{recs} = \@records; + $self->{conf}{marc} = $args{marcfile}; } else { die "Can't open marc file: $!\n"; } # if we have a sample arg, create the sample map + die "Can't use a mapfile and mapstring\n" + if ($args{mapfile} and $args{mapstring}); $self->{map} = Equinox::Migration::SimpleTagList->new(file => $args{mapfile}) if ($args{mapfile}); $self->{map} = Equinox::Migration::SimpleTagList->new(str => $args{mapstring}) @@ -74,10 +99,11 @@ Extracts data from MARC records, per the mapping file. 
sub parse_records { my ($self) = @_; - for my $record ( @{$self->{data}{recs}} ) { + $self->{twig}->parsefile( $self->{conf}{marc} ); + for my $record ( $self->{twig}->root->children ) { my @fields = $record->children; for my $f (@fields) - { $self->process_field($f) } + { $self->process_field($f); $f->purge } # cleanup memory and increment pointer $record->purge; @@ -92,12 +118,17 @@ sub process_field { return unless ($tag and $tag > 9); # increment raw tag count + $self->{data}{tcnt}++; $self->{data}{tags}{$tag}++; if ($map and $map->has($tag)) { my @subs = $field->children('subfield'); + my $i= 0; for my $sub (@subs) - { $self->process_subs($tag, $sub) } + { $self->process_subs($tag, $sub); $sub->purge; $i++ } + + # increment sub length counter + $self->{data}{scnt}{$tag}{$i}++; } } @@ -109,39 +140,35 @@ sub process_subs { # handle unmapped tag/subs my $samp = $self->{data}{samp}; # set a value, total-seen count and records-seen-in count - $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code}; + $samp->{$tag}{$code}{value} = $sub->text unless $samp->{$tag}{$code}; $samp->{$tag}{$code}{count}++; - $samp->{$tag}{$code}{rcnt}++ unless ( defined $samp->{$tag}{$code}{last} and - $samp->{$tag}{$code}{last} == $self->{data}{rcnt} ); - $samp->{$tag}{$code}{last} = $self->{data}{rcnt}; + $samp->{$tag}{$code}{tcnt}++ unless ( defined $samp->{$tag}{$code}{last} and + $samp->{$tag}{$code}{last} == $self->{data}{tcnt} ); + $samp->{$tag}{$code}{last} = $self->{data}{tcnt}; } =head1 SAMPLED TAGS -If the C argument is passed to L, there will also be a -structure which holds data about unmapped subfields encountered in -mapped tags which are also in the declared sample set. This -information is collected over the life of the object and is not reset -for every record processed (as the current record data neccessarily -is). +If the C<mapfile> or C<mapstring> arguments are passed to L</new>, a +structure will be constructed which holds data about tags in the map. 
{ tag_id => { sub_code => { value => VALUE, count => COUNT, - rcnt => RCOUNT + tcnt => TAGCOUNT }, ... }, ... } -For each mapped tag, for each unmapped subfield, there is a hash of -data about that subfield containing +For each subfield in each mapped tag, there is a hash of data about +that subfield containing * value - A sample of the subfield text * count - Total number of times the subfield was seen - * rcnt - The number of records the subfield was seen in + * tcnt - The number of tags the subfield was seen in =head1 AUTHOR