X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=Equinox-Migration%2Flib%2FEquinox%2FMigration%2FMARCXMLSampler.pm;h=1cd8b5c0dddc34b6c0818a6e5b58d2c1f9ae2f51;hp=1fbfefed0eaf6424ce45034d63562d9cb0caeccc;hb=1c8c72dc5c4f99e6eacd61c4ddbf96e835a79cf1;hpb=92480d2b668d7ddd2237cca8d4d6c20f5f2e43f9 diff --git a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm index 1fbfefe..1cd8b5c 100644 --- a/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm +++ b/Equinox-Migration/lib/Equinox/Migration/MARCXMLSampler.pm @@ -13,11 +13,11 @@ Equinox::Migration::MARCXMLSampler =head1 VERSION -Version 1.000 +Version 1.002 =cut -our $VERSION = '1.000'; +our $VERSION = '1.002'; =head1 SYNOPSIS @@ -61,7 +61,9 @@ sub new { my ($class, %args) = @_; my $self = bless { data => { recs => undef, # X::T record objects - rcnt => 0, # next record counter + rcnt => 0, # record counter + tcnt => 0, # tag counter + scnt => {}, # subfield/tag counters samp => {}, # data samples tags => {}, # all found tags }, @@ -71,9 +73,7 @@ sub new { die "Argument 'marcfile' must be specified\n" unless ($args{marcfile}); if (-r $args{marcfile}) { $self->{twig} = XML::Twig->new; - $self->{twig}->parsefile($args{marcfile}); - my @records = $self->{twig}->root->children; - $self->{data}{recs} = \@records; + $self->{conf}{marc} = $args{marcfile}; } else { die "Can't open marc file: $!\n"; } @@ -99,10 +99,11 @@ Extracts data from MARC records, per the mapping file. sub parse_records { my ($self) = @_; - for my $record ( @{$self->{data}{recs}} ) { + $self->{twig}->parsefile( $self->{conf}{marc} ); + for my $record ( $self->{twig}->root->children ) { my @fields = $record->children; for my $f (@fields) - { $self->process_field($f) } + { $self->process_field($f); $f->purge } # cleanup memory and increment pointer $record->purge; @@ -117,12 +118,17 @@ sub process_field { return unless ($tag and $tag > 9); # increment raw tag count + $self->{data}{tcnt}++; $self->{data}{tags}{$tag}++; if ($map and $map->has($tag)) { my @subs = $field->children('subfield'); + my $i= 0; for my $sub (@subs) - { $self->process_subs($tag, $sub) } + { $self->process_subs($tag, $sub); $sub->purge; $i++ } + + # increment sub length counter + $self->{data}{scnt}{$tag}{$i}++; } } @@ -134,12 +140,11 @@ sub process_subs { # handle unmapped tag/subs my $samp = $self->{data}{samp}; # set a value, total-seen count and records-seen-in count - $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code}; + $samp->{$tag}{$code}{value} = $sub->text unless $samp->{$tag}{$code}; $samp->{$tag}{$code}{count}++; - $samp->{$tag}{$code}{rcnt}++ unless ( defined $samp->{$tag}{$code}{last} and - $samp->{$tag}{$code}{last} == $self->{data}{rcnt} ); - $samp->{$tag}{$code}{last} = $self->{data}{rcnt}; - #FIXME tcnt not rcnt + $samp->{$tag}{$code}{tcnt}++ unless ( defined $samp->{$tag}{$code}{last} and + $samp->{$tag}{$code}{last} == $self->{data}{tcnt} ); + $samp->{$tag}{$code}{last} = $self->{data}{tcnt}; } @@ -151,7 +156,7 @@ structure will be constructed which holds data about tags in the map. { tag_id => { sub_code => { value => VALUE, count => COUNT, - rcnt => RCOUNT + tcnt => TAGCOUNT }, ... }, @@ -163,7 +168,7 @@ that subfield containing * value - A sample of the subfield text * count - Total number of times the subfield was seen - * rcnt - The number of records the subfield was seen in + * tcnt - The number of tags the subfield was seen in =head1 AUTHOR