X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=Equinox-Migration%2Flib%2FEquinox%2FMigration%2FMapDrivenMARCXMLProc.pm;h=9844daf68f02ef213b23c7d195801e177ea3cb4f;hp=2c789c9d24aa2e854c89dd38c5db4a33e844d73f;hb=ca9dd06ee372e59b4fe0ecdcbe24cf7468d8577f;hpb=df7e60bd03f18a58a67c8d46f15d91fa98a7e5de diff --git a/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm b/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm index 2c789c9..9844daf 100644 --- a/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm +++ b/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm @@ -4,7 +4,17 @@ use warnings; use strict; use XML::Twig; -use Equinox::Migration::SubfieldMapper; +use Equinox::Migration::SubfieldMapper 1.002; + +# FIXME +# +# sample functionality should be extracted into a new module which +# uses E::M::SM to drive sampling of individual datafields, and +# reports ALL datafields which occur +# +# --sample should give the list of all datafields +# --samplefile should take a SM map as teh argument and introspect the mapped datafields + =head1 NAME @@ -84,6 +94,13 @@ sub new { die "Can't open marc file: $!\n"; } + # if we have a sample arg, set up the sample set and umap hash + if (defined $args{sample}) { + for my $s ( @{$args{sample}}) + { $self->{data}{stag}{$s} = 1 } + $self->{data}{umap} = {}; + } + return $self; } @@ -119,35 +136,39 @@ sub parse_record { return $self->{data}{crec}; } -=head2 process_field - -=cut - sub process_field { my ($self, $field) = @_; my $map = $self->{map}; my $tag = $field->{'att'}->{'tag'}; my $crec = $self->{data}{crec}; + # leader + unless (defined $tag) { + #FIXME + return; + } + # datafields - if (defined $tag) { - if ($tag == 903) { - my $sub = $field->first_child('subfield'); - $crec->{egid} = $sub->text;; - } elsif ($map->has($tag)) { - push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef }; - my @subs = $field->children('subfield'); - for my $sub 
(@subs) - { $self->process_subs($tag, $sub) } - # check map to ensure all declared subs are in + if ($tag == 903) { + my $sub = $field->first_child('subfield'); + $crec->{egid} = $sub->text; + return; + } + if ($map->has($tag)) { + push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef }; + my @subs = $field->children('subfield'); + for my $sub (@subs) + { $self->process_subs($tag, $sub) } + # check map to ensure all declared subs have a value + my $mods = $map->mods($field); + for my $mappedsub ( @{ $map->subfields($tag) } ) { + next if $mods->{multi}; + $crec->{tags}[-1]{uni}{$mappedsub} = '' + unless defined $crec->{tags}[-1]{uni}{$mappedsub}; } } } -=head2 process_subs - -=cut - sub process_subs { my ($self, $tag, $sub) = @_; my $map = $self->{map}; @@ -162,27 +183,60 @@ sub process_subs { # set a value, total-seen count and records-seen-in count $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code}; $u->{$tag}{$code}{count}++; - $u->{$tag}{$code}{rcnt}++ unless ($u->{$tag}{$code}{last} == $self->{data}{rptr}); + $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and + $u->{$tag}{$code}{last} == $self->{data}{rptr} ); $u->{$tag}{$code}{last} = $self->{data}{rptr}; return; } + # fetch our datafield struct and fieldname my $dataf = $self->{data}{crec}{tags}[-1]; my $field = $map->field($tag, $code); - # handle modifiers - if (defined $map->mods($field)) { - if ($map->mods($field) eq 'multi') { + # handle modifiers, or slug data in normally + if (my $mods = $map->mods($field)) { + if ($mods->{multi}) { my $name = $tag . 
$code; push @{$dataf->{multi}{$name}}, $sub->text; } + } else { + die "Multiple occurances of a non-multi field: \n" + if (defined $dataf->{uni}{$code}); + $dataf->{uni}{$code} = $sub->text; } - - $dataf->{uni}{$code} = $sub->text; } +=head1 MODIFIERS + +MapDrivenMARCXMLProc implements the following modifiers, and passes +them to L, meaning that specifying +any other modifiers in a MDMP map file will cause a fatal error when +it is processed. + +=head2 multi + +If a mapping is declared to be C, then MDMP expects to see more +than one instance of that subfield per datafield, and the data is +handled accordingly (see L below). + +Occurring zero or one time is legal for a C mapping. + +A mapping which is not flagged as C, but which occurs more than +once per datafield will cause a fatal error. + +=head2 bib + +=head2 required + =head1 PARSED RECORDS +Given: + + my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS); + $rec = $m->parse_record; + +Then C<$rec> will look like: + { egid => evergreen_record_id, bib => { @@ -206,43 +260,67 @@ key which points to an arrayref. =head3 C -This hashref holds extracted data which should occur once per record -(the default assumption is that a tag/subfield pair can occur multiple -times per record). The keys are composed of tag id and subfield code, -catenated (e.g. 901c). The values are the contents of that subfield of -that tag. +A reference to a hash which holds extracted data which occurs only +once per record (and is therefore "bib-level"; the default assumption +is that a tag/subfield pair can occur multiple times per record). The +keys are composed of tag id and subfield code, catenated +(e.g. 901c). The values are the contents of that subfield of that tag. -If there are no tags defined as bib-level, C will be C. +If there are no tags defined as bib-level in the mapfile, C will +be C. =head3 C -This arrayref holds anonymous hashrefs, one for each instance of each -tag which occurs in the map. 
Each tag hashref holds its own id -(e.g. C<998>), and two more hashrefs, C and C. +A reference to a list of anonymous hashes, one for each instance of +each tag which occurs in the map. + +Each tag hash holds its own id (e.g. C<998>), and two references to +two more hashrefs, C and C. -The C hashref holds the extracted data for tag/sub mappings -which have the C modifier on them. The keys in C are +The C hash holds the extracted data for tag/sub mappings which +have the C modifier on them. The keys in C are composed of the tag id and subfield code, catenated (e.g. C<901c>). The values are arrayrefs containing the content of all -instances of that subfield in that instance of that tag. +instances of that subfield in that instance of that tag. If no tags +are defined as C, it will be C. -The C hashref holds data for tag/sub mappings which occur only -once per instance of a tag (but may occur multiple times in a record -due to there being multiple instances of that tag in a record). Keys -are subfield codes and values are subfield content. +The C hash holds data for tag/sub mappings which occur only once +per instance of a tag (but may occur multiple times in a record due to +there being multiple instances of that tag in a record). Keys are +subfield codes and values are subfield content. -If no tags are defined as C, it will be C. +All C subfields occurring in the map are guaranteed to be +defined. Subfields which are mapped but do not occur in a particular +datafield will be given a value of '' (the null string) in the current +record struct. Contrast this with subfields which are not mapped, which will be +C. =head1 UNMAPPED TAGS +If the C argument is passed to L, there will also be a +structure which holds data about unmapped subfields encountered in +mapped tags which are also in the declared sample set. This +information is collected over the life of the object and is not reset +for every record processed (as the current record data necessarily +is). 
+ { tag_id => { - sub_code => { value => VALUE, count => COUNT }, - sub_code2 => { value => VALUE, count => COUNT }, + sub_code => { value => VALUE, + count => COUNT, + rcnt => RCOUNT + }, ... }, ... } +For each mapped tag, for each unmapped subfield, there is a hash of +data about that subfield containing + + * value - A sample of the subfield text + * count - Total number of times the subfield was seen + * rcnt - The number of records the subfield was seen in + =head1 AUTHOR Shawn Boyette, C<< >>