X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=elect_zips;fp=elect_zips;h=7671bc02901eb830d6304607455afa80a7355ad3;hp=0000000000000000000000000000000000000000;hb=6aa33c3a44b3684b0b176ac0cc743106bd0348b5;hpb=ccd3ad3cdb32e053fd6655ea7340466eecf5f0ff diff --git a/elect_zips b/elect_zips new file mode 100755 index 0000000..7671bc0 --- /dev/null +++ b/elect_zips @@ -0,0 +1,55 @@ +#!/usr/bin/perl -w + +use strict; + +my $VERSION = '1.00'; + +=pod + +NAME + + elect_zips - Utility to elect a winning city/state for each ZIP code based on patron data + +USAGE + + psql -U evergreen -A -t -F $'\t' -c 'SELECT city, state, post_code FROM actor.usr_address' > raw-csz.tsv + elect_zips < raw-csz.tsv > winning-zips.tsv + +NOTES + + Given input like "Miami Springs\tFL\t33166\n" derived from patron addresses, + this utility will print a city and state for each zip that has the maximum + number of occurrences. (It does not attempt to break ties. If there is a tie, + the city and state that reaches the maximum first will end up winning.) + + You can also feed the output of elect_zips directly into I + +=cut + +my %zips; + +# Go through the input and tally the city-state combinations for each ZIP code +while (<>) { + chomp; + (my $city, my $state, my $zip) = split(/\t/) or next; + next unless $zip =~ m/([\d]{5})/; # If it doesn't have 5 digits in a row, it's not a ZIP + $zip =~ s/^([\d]{5}).*$/$1/; # We only want the 5-digit ZIP + $state = uc($state); + $city =~ s/^\s+//; + $city =~ s/\s+$//; + $zips{$zip}{"$city\t$state"}++; +} + +# Pick and print a winner for each ZIP code +foreach(sort keys %zips) { + my $zip = $_; + my $max = 0; + my $citystate = ""; + foreach(keys %{$zips{$zip}}) { + if ($zips{$zip}{$_} > $max) { + $max = $zips{$zip}{$_}; + $citystate = $_; + } + } + print "$citystate\t$zip\n"; +}