X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=fingerprinter;h=6e04e0ba89cf4eb08983896c79f2ff01755dd0ee;hp=ddf3db89c2dcefe07830f8ca48124961a4572959;hb=a09dbe656fda90a77c36fd3093f99e7f26f4f480;hpb=94a0102ed53ccb07f0183944a89cf60c340bc4f5 diff --git a/fingerprinter b/fingerprinter index ddf3db8..6e04e0b 100755 --- a/fingerprinter +++ b/fingerprinter @@ -7,6 +7,7 @@ use Getopt::Long; use MARC::Batch; use Unicode::Normalize; use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use Equinox::Migration::SubfieldMapper; my $conf = {}; # configuration hashref my $count = 0; my $scount = 0; @@ -41,6 +42,8 @@ for my $file (@ARGV) { # populate and normalize marc $marc = populate_marc($record, $id); + # check for manual exclusion + next if this_record_is_excluded($record, $marc); normalize_marc($marc); unless (marc_isvalid($marc)) { dump_exception($marc); next; } @@ -240,15 +243,16 @@ sub score_marc { if (defined $marc->{tag300a} and $marc->{tag300a} =~ /copy/i); # subtract record id if we want older records to win - $marc->{age_score} -= $marc->{id} unless ($conf->{newwins}); + #$marc->{age_score} -= $marc->{id} unless ($conf->{newwins}); # handle arbitrary adjustments - if ($conf->{'arbitrarily-decrease-score-above'}) { - $marc->{age_score} -= $conf->{'arbitrarily-decrease-score-by'} - if ($marc->{id} >= $conf->{'arbitrarily-decrease-score-above'}); + $marc->{age_score} = 1; + if ($conf->{'arbitrarily-lose-above'}) { + $marc->{age_score} = 0 + if ($marc->{id} >= $conf->{'arbitrarily-lose-above'}); } - if ($conf->{'arbitrarily-decrease-score-below'}) { - $marc->{age_score} -= $conf->{'arbitrarily-decrease-score-by'} - if ($marc->{id} <= $conf->{'arbitrarily-decrease-score-below'}); + if ($conf->{'arbitrarily-lose-below'}) { + $marc->{age_score} = 0 + if ($marc->{id} <= $conf->{'arbitrarily-lose-below'}); } #---------------------------------- @@ -287,7 +291,7 @@ sub score_marc { } $json .= 'misc:' . $marc->{misc_score} . 
'}'; - my $compact = join('', $marc->{misc_score}, @score, $marc->{age_score}); + my $compact = join('', $marc->{age_score}, $marc->{misc_score}, @score); $marc->{score} = "$compact\t$json"; } @@ -373,12 +377,18 @@ Write line of exception report =cut sub dump_exception { - my ($marc) = @_; + my ($marc, $msg) = @_; unless (defined $marc) { print XF "Undefined record at line $count; likely bad XML\n"; return; } + print XF "Record ", $marc->{id}, " excluded: "; + if (defined $msg) { + print XF "$msg\n"; + return + } + print XF "missing item_form; " unless ($marc->{item_form}); unless (defined $marc->{date1}) { print XF "missing date1; " } @@ -392,6 +402,42 @@ sub dump_exception { } +=head2 this_record_is_excluded + +Returns 1 if the record B<is> and 0 if the record B<is not> excluded, +according to the subfield mapping (generated via the C<--excludelist> +option). A record is excluded as soon as any mapped tag/subfield it carries either has no filters or has a filter matching the subfield contents (case-insensitive); each exclusion is logged via C<dump_exception>. + +=cut + +sub this_record_is_excluded { + my ($rec, $marc) = @_; + return 0 unless defined $conf->{excludelist}; + + for my $tag (keys %{ $conf->{excludelist}->{tags} }) { + for my $sub (keys %{$conf->{excludelist}->{tags}{$tag}}) { + my $f = $conf->{excludelist}->field($tag, $sub); + + # record lacks this tag/sub: this entry can't apply, but later ones still might + next unless ($rec->field($tag) and $rec->field($tag)->subfield($sub)); + # but it does, so if there are no filters to check... + unless ($conf->{excludelist}->filters($f)) + { dump_exception($marc, "exclusion $tag$sub"); return 1 } + + my $sub_contents = $rec->field($tag)->subfield($sub); + for my $filter (@{ $conf->{excludelist}->filters($f)}) { + if ($sub_contents =~ /$filter/i) { + # filter matches. no fp. + dump_exception($marc, "exclusion $tag$sub '$filter'"); + return 1; + } + } + } + } + # every mapped tag/sub and every filter was checked without a match: no exclude + return 0; +} + =head2 initialize Performs boring script initialization. 
Handles argument parsing, @@ -415,10 +461,10 @@ sub initialize { 'tag|t=s', 'fingerprints=s', 'scores=s', - 'arbitrarily-decrease-score-above=i', - 'arbitrarily-decrease-score-below=i', - 'arbitrarily-decrease-score-by=i', + 'arbitrarily-lose-above=i', + 'arbitrarily-lose-below=i', 'newwins', + 'excludelist=s', 'quiet|q', 'help|h', ); @@ -464,13 +510,17 @@ sub initialize { $c->{tag} = 903 unless defined $c->{tag}; $c->{subfield} = 'a' unless defined $c->{subfield}; $c->{marctype} = 'XML' unless defined $c->{marctype}; - $c->{'arbitrarily-decrease-score-by'} = 0 - unless defined $c->{'arbitrarily-decrease-score-by'}; if ($c->{prefix}) { $c->{output} = join('.',$c->{prefix},'fp'); $c->{exception} = join('.',$c->{prefix},'fp','ex'); } + # get SFM object if excludelist was specified + if ($c->{excludelist}) { + $c->{excludelist} = + Equinox::Migration::SubfieldMapper->new( file => $c->{excludelist} ); + } + my @keys = keys %{$c}; show_help() unless (@ARGV and @keys); for my $key ('tag', 'subfield', 'output', 'exception') @@ -516,16 +566,16 @@ Options --fingerprints=LIST Fingerprints to generate, comma separated Default: oclc,isbn,edition,issn,lccn,accomp,authpub Others: baseline + --excludelist=FILE Name of fingerprints exclusions file --scores=LIST Scores to calculate, comma separated Default: oclc,dlc,num_650,num_tags,enc_level --newwins New record IDs score higher (default is old wins) - --arbitrarily-decrease-score-above - --arbitrarily-decrease-score-below + --arbitrarily-lose-above + --arbitrarily-lose-below --arbitrarily-decrease-score-by Modify fingerprint scoring of records whose EG id is above or below a - given value, inclusive (so 5 is <= 5 or >= 5). -by gives the amount by - which to adjust the score. + given value, inclusive (so 5 is <= 5 or >= 5) such that they lose. --marctype=TYPE Defaults to 'XML' HELP