From: Ben Ostrowsky Date: Wed, 11 May 2011 17:07:35 +0000 (-0400) Subject: Two new utilities that, together, can generate a zips.txt file customized for your... X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=6aa33c3a44b3684b0b176ac0cc743106bd0348b5 Two new utilities that, together, can generate a zips.txt file customized for your patron database. --- diff --git a/elect_zips b/elect_zips new file mode 100755 index 0000000..7671bc0 --- /dev/null +++ b/elect_zips @@ -0,0 +1,55 @@ +#!/usr/bin/perl -w + +use strict; + +my $VERSION = '1.00'; + +=pod + +NAME + + elect_zips - Utility to elect a winning city/state for each ZIP code based on patron data + +USAGE + + psql -U evergreen -A -t -F $'\t' -c 'SELECT city, state, post_code FROM actor.usr_address' > raw-csz.tsv + elect_zips < raw-csz.tsv > winning-zips.tsv + +NOTES + + Given input like "Miami Springs\tFL\t33166\n" derived from patron addresses, + this utility will print a city and state for each zip that has the maximum + number of occurrences. (It does not attempt to break ties. If there is a tie, + the city and state that reaches the maximum first will end up winning.) + + You can also feed the output of elect_zips directly into I + +=cut + +my %zips; + +# Go through the input and tally the city-state combinations for each ZIP code +while (<>) { + chomp; + (my $city, my $state, my $zip) = split(/\t/) or next; + next unless $zip =~ m/([\d]{5})/; # If it doesn't have 5 digits in a row, it's not a ZIP + $zip =~ s/^([\d]{5}).*$/$1/; # We only want the 5-digit ZIP + $state = uc($state); + $city =~ s/^\s+//; + $city =~ s/\s+$//; + $zips{$zip}{"$city\t$state"}++; +} + +# Pick and print a winner for each ZIP code +foreach(sort keys %zips) { + my $zip = $_; + my $max = 0; + my $citystate = ""; + foreach(keys %{$zips{$zip}}) { + if ($zips{$zip}{$_} > $max) { + $max = $zips{$zip}{$_}; + $citystate = $_; + } + } + print "$citystate\t$zip\n"; +} diff --git a/enrich_zips b/enrich_zips new file mode 100755 index 0000000..28587df --- /dev/null +++ b/enrich_zips @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +my $VERSION = '1.01'; + +=pod + +NAME + + enrich_zips - Utility to add county information to city/state/ZIP data (and optionally use zips.txt format) + +USAGE + + enrich_zips --db US.txt < citystatezip.tsv + enrich_zips --makezips --db US.txt < citystatezip.tsv > zips.txt + +NOTES + + Geonames database can be downloaded from http://download.geonames.org/export/zip/US.zip + +=cut + +my ($db, $makezips, %zips, $warn); +my $result = GetOptions ("db=s" => \$db, + "makezips" => \$makezips, + "warn" => \$warn); +die + "Please specify the location of the Geonames database with --db US.txt\n" . + "HINT: You can download it at http://download.geonames.org/export/zip/US.zip" . + "\n\nProgram halted" +unless defined($db); + +open DB, $db or die "Couldn't open Geonames database $db: $!\n"; + +# Slurp in the Geonames database +while () { + chomp; + my @f = split(/\t/); + @{$zips{$f[1]}} = @f[4,2,5]; + # @{$zips{"33166"}} == ("FL", "Miami Springs", "Miami-Dade") +} + +while (<>) { + + chomp; + (my $city, my $state, my $zip) = split(/\t/) or next; + my $county = ""; + my ($dbcity, $dbstate); + + if (defined $zips{$zip}) { + ($dbstate, $dbcity, $county) = @{$zips{$zip}}; + } + + if ($warn) { + if (!defined $zips{$zip}) { + print STDERR "No county data found for ZIP code $zip ($city, $state)\n\n"; + next; + } + if ($city ne $dbcity || $state ne $dbstate) { + print STDERR "Patron data input says $zip is $city, $state\n". + "Geonames database says $zip is $dbcity, $dbstate\n\n"; + } + } + + if ($makezips) { + print "|" . join("|", ($state, $city, $zip, "1", "", $county)) . "||\n"; + } else { + print join("\t", ($city, $state, $zip, $county)) . "\n"; + } +}