From f9201dc2d1699f5161e5e29690a1634e8063bb85 Mon Sep 17 00:00:00 2001
From: Jason Etheridge
Date: Fri, 7 Apr 2017 08:31:31 -0400
Subject: [PATCH] "mig" tool

Wraps some new and existing tools in a git-like command system. It is
currently great for quickly getting files into staging tables, and it has
aspirations of one day tracking mapping relationships and doing code
generation for mapping.

Signed-off-by: Jason Etheridge
---
 mig | 345 ++++
 mig-bin/Mig.pm | 261 ++++
 mig-bin/mig-add | 127 ++++
 mig-bin/mig-clean | 127 ++++
 mig-bin/mig-convert | 131 ++++
 mig-bin/mig-env | 244 +++
 mig-bin/mig-iconv | 107 +++
 mig-bin/mig-init | 121 ++++
 mig-bin/mig-link | 87 +++
 mig-bin/mig-mapper | 778 ++++++++++++++++++++++
 mig-bin/mig-quicksheet | 286 ++++
 mig-bin/mig-remove | 67 ++
 mig-bin/mig-skip-clean | 98 +++
 mig-bin/mig-skip-iconv | 85 +++
 mig-bin/mig-stage | 128 ++++
 mig-bin/mig-status | 87 +++
 mig-bin/mig-unlink | 71 ++
 text/clean_csv | 792 +++++++++++++++++++++++
 text/clean_csv.prereqs | 2 +
 text/csv2sql | 161 +++
 text/csv2sql.prereqs | 1 +
 text/csvcat | 71 ++
 text/examples/FolletDestinyPatronExport.headers | 37 ++
 text/examples/HOLDINGS-MULT.headers | 3 +
 text/examples/csv.clean.conf | 3 +
 text/examples/destiny_852.map | 12 +
 text/examples/horizon_949.map | 24 +
 text/examples/tabs.clean.conf | 3 +
 text/fix_split_csv | 20 +
 text/html2csv.py | 139 ++++
 text/html2tsv.py | 139 ++++
 text/join_lines | 100 +++
 text/join_lines.one-off.001 | 6 +
 text/join_lines_if_short | 87 +++
 text/split_body_from_headers | 13 +
 text/strip_cm_tabs | 6 +
 36 files changed, 4769 insertions(+), 0 deletions(-)
 create mode 100755 mig
 create mode 100644 mig-bin/Mig.pm
 create mode 100755 mig-bin/mig-add
 create mode 100755 mig-bin/mig-clean
 create mode 100755 mig-bin/mig-convert
 create mode 100755 mig-bin/mig-env
 create mode 100755 mig-bin/mig-iconv
 create mode 100755 mig-bin/mig-init
 create mode 100755 mig-bin/mig-link
 create mode 100755 mig-bin/mig-mapper
 create mode 100755 mig-bin/mig-quicksheet
 create mode 100755 mig-bin/mig-remove
 create mode 100755 mig-bin/mig-skip-clean
 create mode 100755 mig-bin/mig-skip-iconv
 create mode 100755 mig-bin/mig-stage
 create mode 100755 mig-bin/mig-status
 create mode 100755 mig-bin/mig-unlink
 create mode 100755 text/clean_csv
 create mode 100644 text/clean_csv.prereqs
 create mode 100755 text/csv2sql
 create mode 100644 text/csv2sql.prereqs
 create mode 100755 text/csvcat
 create mode 100644 text/examples/FolletDestinyPatronExport.headers
 create mode 100644 text/examples/HOLDINGS-MULT.headers
 create mode 100644 text/examples/csv.clean.conf
 create mode 100644 text/examples/destiny_852.map
 create mode 100644 text/examples/horizon_949.map
 create mode 100644 text/examples/tabs.clean.conf
 create mode 100755 text/fix_split_csv
 create mode 100755 text/html2csv.py
 create mode 100755 text/html2tsv.py
 create mode 100755 text/join_lines
 create mode 100755 text/join_lines.one-off.001
 create mode 100755 text/join_lines_if_short
 create mode 100755 text/split_body_from_headers
 create mode 100755 text/strip_cm_tabs

diff --git a/mig b/mig
new file mode 100755
index 0000000..ea6cfd1
--- /dev/null
+++ b/mig
@@ -0,0 +1,345 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig - git-like program for tracking and manipulating legacy data files for
migrations

=head1 SYNOPSIS

B<mig> <command> [argument] [...]

=head1 DESCRIPTION

B<mig> is used to track and manipulate CSV or CSV-like text files exported from
legacy systems for migration into Evergreen. It can be a wrapper for some
other migration tools and tracks state using a PostgreSQL table in a given
migration schema.

It makes use of certain environment variables that may be set by the B<mig env>
tool: PGHOST, PGPORT, PGUSER, PGDATABASE, MIGSCHEMA, and MIGWORKDIR.

For most commands, if the current working directory falls outside of the
directory specified by MIGWORKDIR, then mig will assume that environment is
also incorrect and bail before doing any actual work.

~/.pgpass should also be configured, as B<mig> will not prompt for a database
password; see the example below.

Only the B<env> and B<help> commands work without the MIGSCHEMA environment
variable being set.
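A ~/.pgpass entry is a single line of colon-separated fields,
hostname:port:database:username:password; the values below are illustrative:

=over 5

db.example.org:5432:evergreen:evergreen:s3cret

=back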
=head1 OVERVIEW

Using B<mig> should go something like this:

=over 15

=item mig env create m_foo # Sets up the environment

=item mig env use m_foo # Spawns a shell using the configured environment

=item mig init # creates the m_foo schema in the database if needed, and other tables

=item mig add patrons.tsv # tracks an incoming data file; repeat for additional files

=item mig iconv patrons.tsv # convert it to UTF8, creating patrons.tsv.utf8

=item mig clean patrons.tsv # cleans the file, creating patrons.tsv.utf8.clean

=item mig link patrons.tsv actor_usr # makes the soon-to-be staging table a child of m_foo.actor_usr

=item mig convert patrons.tsv # creates a .sql file for staging the data

=item mig stage patrons.tsv # load said .sql file

=item mig mapper patrons.tsv # interactive tool for analyzing/mapping the staging table

=item mig analysis patrons.tsv # writes a summary .tsv file of mapped/flagged fields from the staging table

=item mig map patrons.tsv # apply configured mappings

=item mig write_prod patrons.tsv # creates a .sql file for pushing the staging data into production

=back

=head1 COMMANDS

=over 15

=item B<help> [command]

Display this very same documentation, or specific documentation for one of the
commands listed here.

=item B<env> <create|use|show|list> [schema]

Invokes B<mig-env> with the same arguments. B<mig-env> can set important
environment variables and spawn a shell with those variables, and it also does
some directory creation and symlinking.

=item B<init>

Create or re-create the PostgreSQL tracking table for the schema specified by
the MIGSCHEMA environment variable. If needed, create the migration schema
itself and run migration_tools.init() and build() if the migration_tools schema
exists.

=item B<status> [file] [...]

Show status information for either the specified files or all tracked files if
no argument is given.

=item B<add> [--no-headers|--headers] <file> [file|--no-headers|--headers] [...]

Add the specified files to the migration tracker. Until --no-headers is
specified, the tracker will assume the files have headers.

You can do crazy stuff like
B<mig add file1 --no-headers file2 file3 --headers file4>

=item B<remove> <file> [file] [...]

Remove the specified files from the migration tracker.

=item B<iconv> <file> [other arguments...]

Attempts to invoke B<iconv> on the specified tracked file, placing the output
in <file>.utf8

If given no other arguments, the invocation will look like

=over 5

iconv -f ISO-8859-1 -t UTF-8 -o <file>.utf8 <file>

=back

otherwise, the arguments will be passed through like so

=over 5

iconv [other arguments...] -o <file>.utf8 <file>

=back
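For instance (the filename and source encoding here are hypothetical), a
Windows-1252 export could be converted by passing arguments through:

=over 5

mig iconv patrons.tsv -f WINDOWS-1252 -t UTF-8

=back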
=item B<skip-iconv> <file>

If this is used instead of B<iconv>, then B<mig> will look for an existing
<file>.utf8 and use it instead of attempting to create one.

=item B<clean> <file> [other arguments...]

Attempts to invoke B<clean_csv> on the iconv-converted specified tracked file,
placing the output in <file>.utf8.clean

If given no other arguments, the invocation will look like

=over 5

clean_csv --config scripts/clean.conf --fix --apply <--create-headers> <file>.utf8

=back

otherwise, the arguments will be passed through like so

=over 5

clean_csv [other arguments...] <file>.utf8

=back

=item B<skip-clean> <file>

If this is used instead of B<clean>, then B<mig> will look for an existing
<file>.utf8.clean and use it instead of attempting to create one.

=item B<link> <file> <parent table>

Associate the specified file with a parent table within the migration schema.

Linking multiple files to the same parent table is not allowed currently.

=item B<unlink> <file>

Removes any association between the specified file and a parent table within
the migration schema.

=item B<convert> <file>

Attempts to invoke B<csv2sql> on the .utf8.clean version of the specified
tracked file, creating either [file].utf8.clean.stage.sql or
<parent table>_stage.sql depending on whether the file has been linked to a
parent table within the migration schema or not.

If given no other arguments, the invocation will look like

=over 5

csv2sql --config scripts/clean.conf --add-x-migrate --schema <MIGSCHEMA> [--parent <parent table>] -o <[<file>.utf8.clean.stage.sql]|[<parent table>_stage.sql]> <file>.utf8.clean

=back

otherwise, the arguments will be passed through like so

=over 5

csv2sql [other arguments...] -o <[<file>.utf8.clean.stage.sql]|[<parent table>_stage.sql]> <file>.utf8.clean

=back

=item B<stage> <file> [other arguments...]

Load the SQL-converted version of the specified file into the migration schema.

Extra arguments are passed to the underlying call to psql.

=item B<mapper> <file>

Interactive session for analyzing, flagging, and mapping legacy field data to
Evergreen fields.

Upon exit, generate either [file].clean.map.sql or <parent table>_map.sql. The
SQL generated will be UPDATEs for setting the Evergreen-specific columns for a
given file's staging tables, and TRUNCATEs and INSERTs for auxiliary tables.
The files will have \include hooks for pulling in additional mapping files
(for example, end-user mappings for circ modifiers, etc.)

=item B<quicksheet> [file]

Writes a MIGSCHEMA.tsv file containing a break-down of mapped and flagged
fields from the specified file, or all staged files if no file is specified.

The main goal of the tsv file is to present end-user mappable data for circ
modifiers, shelving locations, patron profiles, etc. We use spreadsheets for
this now but may move to a dedicated UI in the future.

=item B<map> [file]

Applies the mapping sql to the migration schema for the specified mapped file,
or for all mapped files if no file is specified.

=item B<write_prod> [file]

Generates <parent table>_prod.sql for the specified linked and mapped file, or
all such files if no file is specified.

=back

=cut

###############################################################################

use strict;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use FindBin;
my $mig_bin = "$FindBin::Bin/mig-bin/";
use lib "$FindBin::Bin/mig-bin";
use Mig;

pod2usage(-verbose => 2) if ! $ARGV[0];
switch($ARGV[0]) {
    case "help" {
        if (defined $ARGV[1]) {
            my $cmd = $mig_bin . "mig-$ARGV[1]";
            if (-e $cmd) {
                system( $mig_bin . 
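                # (each command here execs its mig-bin/mig-<command>
                # counterpart; --help below makes the sub-tool render its
                # own POD)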
"mig-$ARGV[1]", '--help' ); + } else { + pod2usage(-verbose => 2); + } + } else { + pod2usage(-verbose => 2); + } + } + case "env" { + standard_invocation(@ARGV); + } + case "init" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "status" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "add" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "remove" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "iconv" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "skip-iconv" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "clean" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "skip-clean" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "link" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "unlink" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "convert" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "stage" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "mapper" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "quicksheet" { + Mig::die_if_no_env_migschema(); + standard_invocation(@ARGV); + } + case "map" { + Mig::die_if_no_env_migschema(); + } + case "load" { + Mig::die_if_no_env_migschema(); + } + case "wdir" { + print "$MIGWORKDIR\n"; + } + case "gdir" { + print "$MIGBASEGITDIR\n"; + } + case "sdir" { + print "$MIGGITDIR\n"; + } + else { + pod2usage(1); + } +} + +sub standard_invocation { + my $cmd = shift; + system( $mig_bin . "mig-$cmd", @_ ); +} + + diff --git a/mig-bin/Mig.pm b/mig-bin/Mig.pm new file mode 100644 index 0000000..6600cfd --- /dev/null +++ b/mig-bin/Mig.pm @@ -0,0 +1,261 @@ +package Mig; + +use strict; +use Exporter; +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +$VERSION = 1.00; +@ISA = qw(Exporter); +@EXPORT = (); +@EXPORT_OK = qw(); +%EXPORT_TAGS = ( + DEFAULT => [] +); + +use DBI; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); + +sub db_connect { + my $dbh = DBI->connect( + "dbi:Pg:host=$PGHOST;dbname=$PGDATABASE;port=$PGPORT" + ,$PGUSER + ,undef + ) || die "Unable to connect to $PGHOST:$PGPORT:$PGDATABASE:$PGUSER : $!\n"; + return $dbh; +} +sub db_disconnect { + my $dbh = shift; + $dbh->disconnect; +} + +sub sql { + my $sql = shift; + chomp $sql; + $sql =~ s/\n//g; + print "\n$sql\n"; + return $sql; +} + +sub die_if_no_env_migschema { + die "MIGSCHEMA environment variable not set. See 'mig env help'\n" + unless $MIGSCHEMA; +} + +sub check_for_db_migschema { + my $dbh = db_connect(); + my $sth = $dbh->prepare(" + SELECT EXISTS( + SELECT 1 + FROM pg_namespace + WHERE nspname = ? 
+ );" + ); + my $rv = $sth->execute($MIGSCHEMA) + || die "Error checking for migration schema ($MIGSCHEMA): $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + my $found; + if ($cols[0]) { + print "Found migration schema ($MIGSCHEMA) at $PGHOST:$PGPORT:$PGDATABASE:$PGUSER\n"; + $found = 1; + } else { + print "Migration schema ($MIGSCHEMA) does not exist at $PGHOST:$PGPORT:$PGDATABASE:$PGUSER\n"; + $found = 0; + } + db_disconnect($dbh); + return $found; +} + +sub check_db_migschema_for_migration_tables { + my $found = check_db_migschema_for_specific_table('asset_copy'); + if (!$found) { + print "Missing migration tables (such as $MIGSCHEMA.asset_copy)\n"; + } + return $found; +} + +sub check_db_migschema_for_specific_table { + my $table = shift; + my $dbh = db_connect(); + my $sth = $dbh->prepare(" + SELECT EXISTS( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = " . $dbh->quote( $MIGSCHEMA ) . " + AND table_name = " . $dbh->quote( $table ) . " + );" + ); + my $rv = $sth->execute() + || die "Error checking migration schema ($MIGSCHEMA) for table ($table): $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + my $found; + if ($cols[0]) { + $found = 1; + } else { + $found = 0; + } + db_disconnect($dbh); + return $found; +} + +sub check_for_migration_tools { + my $dbh = db_connect(); + my $sth = $dbh->prepare(" + SELECT EXISTS( + SELECT 1 + FROM pg_namespace + WHERE nspname = 'migration_tools' + );" + ); + my $rv = $sth->execute() + || die "Error checking for migration_tools schema: $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + db_disconnect($dbh); + return $cols[0]; +} + +sub die_if_no_migration_tools { + if (check_for_migration_tools()) { + print "Found migration_tools schema\n"; + } else { + die "Missing migration_tools schema\n"; + } +} + +sub check_for_mig_tracking_table { + my $dbh = db_connect(); + my $sth = $dbh->prepare(" + SELECT EXISTS( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = " . $dbh->quote( $MIGSCHEMA ) . " + AND table_name = 'tracked_file' + );" + ); + my $rv = $sth->execute() + || die "Error checking for table (tracked_file): $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + db_disconnect($dbh); + return $cols[0]; +} + +sub die_if_mig_tracking_table_exists { + if (check_for_mig_tracking_table()) { + die "Table $MIGSCHEMA.tracked_file already exists. Bailing init...\n"; + } +} + +sub die_if_mig_tracking_table_does_not_exist { + if (!check_for_mig_tracking_table()) { + die "Table $MIGSCHEMA.tracked_file does not exist. Bailing...\n"; + } +} + +sub check_for_mig_column_tracking_table { + my $dbh = db_connect(); + my $sth = $dbh->prepare(" + SELECT EXISTS( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = " . $dbh->quote( $MIGSCHEMA ) . " + AND table_name = 'tracked_column' + );" + ); + my $rv = $sth->execute() + || die "Error checking for table (tracked_column): $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + db_disconnect($dbh); + return $cols[0]; +} + +sub die_if_mig_column_tracking_table_exists { + if (check_for_mig_column_tracking_table()) { + die "Table $MIGSCHEMA.tracked_column already exists. Bailing init...\n"; + } +} + +sub die_if_mig_column_tracking_table_does_not_exist { + if (!check_for_mig_column_tracking_table()) { + die "Table $MIGSCHEMA.tracked_column does not exist. Bailing...\n"; + } +} + +sub check_for_tracked_file { + my $file = shift; + my $options = shift; + if (! 
-e $file) {
        die "file not found: $file\n" unless $options && $options->{'allow_missing'};
    }
    my $dbh = db_connect();
    my $sth = $dbh->prepare("
        SELECT id
        FROM $MIGSCHEMA.tracked_file
        WHERE base_filename = " . $dbh->quote( $file ) . ";"
    );
    my $rv = $sth->execute()
        || die "Error checking table (tracked_file) for base_filename ($file): $!";
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}

sub check_for_tracked_column {
    my ($table,$column,$options) = (shift,shift,shift);
    my $dbh = db_connect();
    my $sth = $dbh->prepare("
        SELECT id
        FROM $MIGSCHEMA.tracked_column
        WHERE staged_table = " . $dbh->quote( $table ) . "
        AND staged_column = " . $dbh->quote( $column ) . ";"
    );
    my $rv = $sth->execute()
        || die "Error checking table (tracked_column) for $table.$column: $!";
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}

sub status_this_file {
    my $file = shift;
    my $dbh = db_connect();
    my $sth = $dbh->prepare("
        SELECT *
        FROM $MIGSCHEMA.tracked_file
        WHERE base_filename = " . $dbh->quote( $file ) . ";"
    );
    my $rv = $sth->execute()
        || die "Error retrieving data from table (tracked_file) for base_filename ($file): $!";
    my $data = $sth->fetchrow_hashref;
    $sth->finish;
    db_disconnect($dbh);
    return $data;
}

sub status_this_column {
    my ($table,$column) = (shift,shift);
    my $dbh = db_connect();
    my $sth = $dbh->prepare("
        SELECT *
        FROM $MIGSCHEMA.tracked_column
        WHERE staged_table = " . $dbh->quote( $table ) . "
        AND staged_column = " . $dbh->quote( $column ) . ";"
    );
    my $rv = $sth->execute()
        || die "Error checking table (tracked_column) for $table.$column: $!";
    my $data = $sth->fetchrow_hashref;
    $sth->finish;
    db_disconnect($dbh);
    return $data;
}

1;

diff --git a/mig-bin/mig-add b/mig-bin/mig-add
new file mode 100755
index 0000000..3e433c5
--- /dev/null
+++ b/mig-bin/mig-add
@@ -0,0 +1,127 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-add - This will add the specified files to the mig tracking table for the
schema pointed to by the MIGSCHEMA environment variable in the PostgreSQL
database specified by various PG environment variables.

--headers (the default) and --no-headers are repeatable, and indicate whether
subsequent files have headers or not.

--headers-file <headers file> specifies a text file defining the column headers
for the next added <file>; it should contain one line per header.

--headers-file will automatically invoke --no-headers.

You'll need to invoke B<mig-init> prior to using commands like B<mig-add>.

=head1 SYNOPSIS

B<mig-add> [--no-headers|--headers|--headers-file <headers file>] <file> [file|--no-headers|--headers|--headers-file <headers file>] [...]
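For example (filenames illustrative), this tracks one file whose headers live
in a separate one-per-line list, then a second file that carries its own
header row:

=over 5

mig add --headers-file patrons.headers patrons.tsv --headers items.tsv

=back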
=cut

###############################################################################

use strict;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use DBI;
use Cwd 'abs_path';
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";
use Mig;

pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

my $has_headers = 1;
my $headers_file;
my $next_arg_is_headers_file = 0;

foreach my $arg (@ARGV) {
    if ($next_arg_is_headers_file) {
        $next_arg_is_headers_file = 0;
        $headers_file = abs_path($arg);
        next;
    }
    if ($arg eq '--headers') {
        $has_headers = 1;
        next;
    }
    if ($arg eq '--no-headers') {
        $has_headers = 0;
        next;
    }
    if ($arg eq '--headers-file') {
        $next_arg_is_headers_file = 1;
        $has_headers = 0;
        next;
    }
    my $file = abs_path($arg);
    if ($file =~ /^$MIGBASEWORKDIR/) {
        if (-e $file) {
            if (-f $file) {
                add_this_file($file,$has_headers,$headers_file);
                $headers_file = ''; # clear after applying to just one file
            } else {
                print "Not a real file: $file\n";
            }
        } else {
            print "Could not find file: $file\n";
        }
    } else {
        print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
    }
}

exit 0;

###############################################################################

sub add_this_file {
    my $file = shift;
    my $headers = shift;
    my $headers_file = shift;
    if ($headers_file) {
        if (! (-e $headers_file && -f $headers_file)) {
            print "Could not find headers file $headers_file, skipping $file\n";
            return;
        }
    }
    if (Mig::check_for_tracked_file($file)) {
        print "File already tracked: $file\n";
    } else {
        print 'Adding (';
        if ($headers_file) {
            print "with headers file = $headers_file";
        } else {
            print ($headers ? 'with headers' : 'without headers');
        }
        print '): ' . "$file\n";
        my $dbh = Mig::db_connect();
        my $rv = $dbh->do("
            INSERT INTO $MIGSCHEMA.tracked_file (
                base_filename
                ,has_headers
                ,headers_file
            ) VALUES (
                " . $dbh->quote($file) . "
                ," . $dbh->quote($headers) . "
                ," . $dbh->quote($headers_file) . "
            );
        ") || die "Error inserting into table $MIGSCHEMA.tracked_file: $!\n";
        Mig::db_disconnect($dbh);
    }
}

diff --git a/mig-bin/mig-clean b/mig-bin/mig-clean
new file mode 100755
index 0000000..b9cb013
--- /dev/null
+++ b/mig-bin/mig-clean
@@ -0,0 +1,127 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-clean

Attempts to invoke B<clean_csv> on the specified tracked file, placing the
output in [file].clean

If given no other arguments, the invocation will look like

=over 5

clean_csv --config scripts/clean.conf --fix --apply [--create-headers|--use-headers <headers file>] <file>

=back

otherwise, the arguments will be passed through like so

=over 5

clean_csv [other arguments...] <file>

=back

You'll need to invoke B<mig-iconv> or B<mig-skip-iconv> prior to using commands
like B<mig-clean>.

=head1 SYNOPSIS

B<mig-clean> <file> [other arguments...]
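For example (the config path is illustrative), arguments given after the file
name replace the defaults and pass straight through to clean_csv:

=over 5

mig clean patrons.tsv --config scripts/patrons.clean.conf --fix --apply

=back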
=cut

###############################################################################

use strict;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use DBI;
use Cwd 'abs_path';
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";
use Mig;

pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

my $file = abs_path($ARGV[0]);
if ($file =~ /^$MIGBASEWORKDIR/) {
    call_clean_csv(@ARGV);
} else {
    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
}

exit 0;

###############################################################################

sub call_clean_csv {
    my $file = abs_path(shift);
    my @args = @_;

    my $tracked_file_id = Mig::check_for_tracked_file($file);
    if ($tracked_file_id) {
        my $data = Mig::status_this_file($file);

        if (! $data->{'utf8_filename'}) {
            die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
        }

        my $utf8_file = $data->{'utf8_filename'};
        if (! -e $utf8_file) {
            die "missing file: $utf8_file\n";
        }

        print "cleaning tracked file: $file\n";

        if (scalar(@args) == 0) {
            @args = (
                '--config'
                ,'scripts/clean.conf'
                ,'--fix'
                ,'--apply'
                ,'--backslash'
                ,'--pad'
            );
            if (! $data->{'has_headers'}) {
                if ($data->{'headers_file'}) {
                    push @args, '--use-headers';
                    push @args, $data->{'headers_file'};
                } else {
                    push @args, '--create-headers';
                }
            }
        }

        print join(' ',@args) . "\n";
        system('clean_csv', @args, $utf8_file);

        my $dbh = Mig::db_connect();
        my $clean_file = $dbh->quote($utf8_file . '.clean');
        if (! -e $utf8_file . '.clean') {
            print "clean file does not exist: $clean_file\n";
            $clean_file = $dbh->quote('');
        }

        my $rv = $dbh->do("
            UPDATE $MIGSCHEMA.tracked_file
            SET clean_filename = $clean_file
            WHERE base_filename = " . $dbh->quote($file) . "
            ;
        ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
        Mig::db_disconnect($dbh);
    } else {
        print "File not currently tracked: $file\n";
    }
}

diff --git a/mig-bin/mig-convert b/mig-bin/mig-convert
new file mode 100755
index 0000000..6fe2172
--- /dev/null
+++ b/mig-bin/mig-convert
@@ -0,0 +1,131 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-convert

Attempts to invoke B<csv2sql> on the .utf8.clean version of the specified
tracked file, creating either [file].utf8.clean.stage.sql or
<parent table>_stage.sql depending on whether the file has been linked to a
parent table within the migration schema or not.

If given no other arguments, the invocation will look like

=over 5

csv2sql --config scripts/clean.conf --add-x-migrate --schema <MIGSCHEMA> [--parent <parent table>] --outfile <[<file>.utf8.clean.stage.sql]|[<parent table>_stage.sql]> <file>.utf8.clean

=back

otherwise, the arguments will be passed through like so

=over 5

csv2sql [other arguments...] --schema <MIGSCHEMA> [--parent <parent table>] --outfile <[<file>.utf8.clean.stage.sql]|[<parent table>_stage.sql]> <file>.utf8.clean

=back

=head1 SYNOPSIS

B<mig-convert> <file> [other arguments...]
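For example (table name illustrative), linking first determines where the SQL
lands, per call_convert_csv below:

=over 5

mig link patrons.tsv actor_usr

mig convert patrons.tsv # writes actor_usr.stage.sql rather than patrons.tsv.utf8.clean.stage.sql

=back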
=cut

###############################################################################

use strict;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use DBI;
use Cwd 'abs_path';
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";
use Mig;

pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

my $file = abs_path($ARGV[0]);
if ($file =~ /^$MIGBASEWORKDIR/) {
    call_convert_csv(@ARGV);
} else {
    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
}

exit 0;

###############################################################################

sub call_convert_csv {
    my $file = abs_path(shift);
    my @args = @_;

    my $stage_sql_filename;
    my $tracked_file_id = Mig::check_for_tracked_file($file);
    if ($tracked_file_id) {
        my $data = Mig::status_this_file($file);

        if (! $data->{'utf8_filename'}) {
            die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
        }

        if (! $data->{'clean_filename'}) {
            die "mig-clean or mig-skip-clean needed for .clean version of file: $file\n";
        }

        my $clean_file = $data->{'clean_filename'};
        if (! -e $clean_file) {
            die "missing file: $clean_file\n";
        }

        print "converting tracked file: $file\n";

        if (scalar(@args) == 0) {
            @args = (
                '--config'
                ,'scripts/clean.conf'
                ,'--add-x-migrate'
            );
        }
        push @args, '--use-no-headers-file';
        push @args, '--schema';
        push @args, $MIGSCHEMA;
        if ($data->{'parent_table'}) {
            push @args, '--parent';
            push @args, $data->{'parent_table'};
            $stage_sql_filename = $data->{'parent_table'} . '.stage.sql';
        } else {
            $stage_sql_filename = "$clean_file.stage.sql";
        }
        push @args, '--outfile';
        push @args, $stage_sql_filename;

        print "args: " . join(',',@args) . "\n";
        system('csv2sql', @args, $clean_file);

        my $dbh = Mig::db_connect();
        if (! -e $stage_sql_filename) {
            print "SQL converted file does not exist: $stage_sql_filename\n";
            $stage_sql_filename = '';
        }

        my $rv = $dbh->do("
            UPDATE $MIGSCHEMA.tracked_file
            SET stage_sql_filename = " . $dbh->quote($stage_sql_filename) . "
            WHERE base_filename = " . $dbh->quote($file) . "
            ;
        ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
        Mig::db_disconnect($dbh);
    } else {
        print "File not currently tracked: $file\n";
    }
}

diff --git a/mig-bin/mig-env b/mig-bin/mig-env
new file mode 100755
index 0000000..eddfc27
--- /dev/null
+++ b/mig-bin/mig-env
@@ -0,0 +1,244 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-env - This tool is for tracking and setting environment variables used by
B<mig> and its sub-tools.

=head1 SYNOPSIS

B<mig-env> <create|use> <migration_schema>

B<mig-env> show [migration_schema]

B<mig-env> list

B<mig-env> help

=head1 DESCRIPTION

For most invocations, B<mig-env> will either create or use a migration-specific
file (~/.mig/<migration_schema>.env) for setting the following environment
variables:

=over 15

=item MIGSCHEMA

The name of the migration schema. Convention has this being a single lowercased
word or acronym identifying the library, prefixed with 'm_'.

=item MIGWORKDIR

The base working directory for containing migration data, scripts, and other
files.

=item PGHOST

The IP address or hostname for the PostgreSQL database used for a migration.

=item PGPORT

The TCP port for the PostgreSQL database.

=item PGUSER

The PostgreSQL user to use for the database.

=item PGDATABASE

The name of the actual database containing the migration schema.

=back

This script may also set up a symlink from a specified Git repository to a
scripts/ directory within the migration work directory. The default for this is
~/git/migration-work/past_migrations/MIGSCHEMA --> MIGWORKDIR/scripts

It may also create the migration work directory if necessary.
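For reference, a sketch of a generated env file for a hypothetical m_foo
schema with everything left at its defaults (paths are expanded from $HOME;
the actual file also records MIGENVPROMPT, the MIGBASE* variables, and a few
aliases):

=over 5

export PGHOST=localhost

export PGPORT=5432

export PGDATABASE=evergreen

export PGUSER=evergreen

export MIGSCHEMA=m_foo

export MIGWORKDIR=/home/user/data/m_foo/

=back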
=head1 COMMANDS

=over 15

=item B<create> <schema>

This invocation will prompt for various values and create a .env file for the
specified migration schema, and a symlink between the specified Git repository
and migration work directory (which will also be created if needed).

=item B<use> <schema>

This command will spawn a bash shell that executes the corresponding
~/.mig/<schema>.env script for setting up environment variables encoded during
B<create>.

=item B<show> [schema]

This command will show the contents of the corresponding ~/.mig/<schema>.env
script, or, if no schema is specified, then it will list pertinent variables in
the current environment if they exist.

=item B<list>

This command will list migration schemas found in ~/.mig

=item B<help>

Display the documentation you're reading now.

=back

=cut

###############################################################################

use strict;
use 5.012;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use File::Path qw(make_path);
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";

pod2usage(-verbose => 2) if ! $ARGV[0];

my $migration_schema = $ARGV[1] || '';
my $filename = "$HOME/.mig/$migration_schema.env";
switch($ARGV[0]) {
    case "--help" {
        pod2usage(-verbose => 2);
    }
    case "help" {
        pod2usage(-verbose => 2);
    }
    case "create" {
        pod2usage(-verbose => 1) if ! $ARGV[1];
        mig_env_create();
    }
    case "use" {
        pod2usage(-verbose => 1) if ! $ARGV[1];
        if (-e $filename) {
            exec '/bin/bash', '--init-file', $filename;
        } else {
            die "\n$filename does not exist\n";
        }
    }
    case "show" {
        if (-e $filename) {
            exec '/bin/cat', $filename;
        } else {
            print `env | sort | egrep 'MIG|PG'`;
        }
    }
    case "list" {
        opendir(my $dh, "$HOME/.mig") || die "can't open $HOME/.mig: $!";
        while (readdir $dh) {
            if (/^(.*)\.env$/) {
                print "$1\n";
            }
        }
        closedir $dh;
    }
    else {
        pod2usage(1);
    }
}

sub mig_env_create {
    if (-e $filename) {
        print "Re-creating $filename\n";
        print `cat $filename`;
    } else {
        print "Creating $filename\n";
    }
    print "\n";

    # directories

    $MIGBASEWORKDIR = "$HOME/data/" unless $MIGBASEWORKDIR;
    my $migworkdir_default = "$MIGBASEWORKDIR$migration_schema/";
    print "Main work directory (default $migworkdir_default): ";
    my $MIGWORKDIR = <STDIN>;
    chomp $MIGWORKDIR;
    if (! $MIGWORKDIR) {
        $MIGWORKDIR = $migworkdir_default;
    }
    $MIGBASEGITDIR = "$HOME/git/migration-work/" unless $MIGBASEGITDIR;
    my $miggitdir_default = "${MIGBASEGITDIR}past_migrations/$migration_schema/";
    print "git repo for migration-specific scripts (default $miggitdir_default): ";
    my $MIGGITDIR = <STDIN>;
    chomp $MIGGITDIR;
    if (! $MIGGITDIR) {
        $MIGGITDIR = $miggitdir_default;
    }

    # PostgreSQL

    $PGHOST = 'localhost' unless $PGHOST;
    my $pghost_default = $PGHOST;
    print "PGHOST (default $pghost_default): ";
    $PGHOST = <STDIN>;
    chomp $PGHOST;
    if (! $PGHOST) {
        $PGHOST = $pghost_default;
    }
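    # (Aside, a hypothetical refactor rather than part of this patch: each
    # prompt/read/fallback block here follows the same pattern and could be
    # collapsed into a helper, e.g.
    #
    #     sub prompt_with_default {
    #         my ($label, $default) = @_;
    #         print "$label (default $default): ";
    #         chomp(my $answer = <STDIN>);
    #         return $answer eq '' ? $default : $answer;
    #     }
    #
    # so that, say, $PGPORT = prompt_with_default('PGPORT', $PGPORT);)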
    $PGPORT = 5432 unless $PGPORT;
    my $pgport_default = $PGPORT;
    print "PGPORT (default $pgport_default): ";
    $PGPORT = <STDIN>;
    chomp $PGPORT;
    if (! $PGPORT) {
        $PGPORT = $pgport_default;
    }
    $PGDATABASE = 'evergreen' unless $PGDATABASE;
    my $pgdatabase_default = $PGDATABASE;
    print "PGDATABASE (default $pgdatabase_default): ";
    $PGDATABASE = <STDIN>;
    chomp $PGDATABASE;
    if (! $PGDATABASE) {
        $PGDATABASE = $pgdatabase_default;
    }
    $PGUSER = $PGDATABASE unless $PGUSER;
    my $pguser_default = $PGUSER;
    print "PGUSER (default $pguser_default): ";
    my $PGUSER = <STDIN>;
    chomp $PGUSER;
    if (! $PGUSER) {
        $PGUSER = $pguser_default;
    }

    # create files and directories if needed

    mkdir "$HOME/.mig";
    make_path($MIGGITDIR, { verbose => 1 });
    `touch $MIGGITDIR/README`;
    make_path($MIGWORKDIR, { verbose => 1 });
    symlink $MIGGITDIR, "$MIGWORKDIR/scripts";
    open FILE, ">$filename";
    print FILE "export PGHOST=$PGHOST\n";
    print FILE "export PGPORT=$PGPORT\n";
    print FILE "export PGDATABASE=$PGDATABASE\n";
    print FILE "export PGUSER=$PGUSER\n";
    print FILE "export MIGENVPROMPT=$migration_schema\n";
    print FILE "export MIGSCHEMA=$migration_schema\n";
    print FILE "export MIGBASEWORKDIR=$MIGBASEWORKDIR\n";
    print FILE "export MIGWORKDIR=$MIGWORKDIR\n";
    print FILE "export MIGBASEGITDIR=$MIGBASEGITDIR\n";
    print FILE "export MIGGITDIR=$MIGGITDIR\n";
    print FILE "alias wcd='cd `mig wdir`'\n";
    print FILE "alias gcd='cd `mig gdir`'\n";
    print FILE "alias scd='cd `mig sdir`'\n";
    print FILE "source ~/.profile\n";
    print FILE "env | sort | egrep 'PG|MIG'\n";
    print FILE 'echo shell PID = $$' . "\n";
    close FILE;
}

diff --git a/mig-bin/mig-iconv b/mig-bin/mig-iconv
new file mode 100755
index 0000000..88acdd0
--- /dev/null
+++ b/mig-bin/mig-iconv
@@ -0,0 +1,107 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-iconv

Attempts to invoke B<iconv> on the specified tracked file, placing the
output in <file>.utf8

If given no other arguments, the invocation will look like

=over 5

iconv -f ISO-8859-1 -t UTF-8 -o <file>.utf8 <file>

=back

otherwise, the arguments will be passed through like so

=over 5

iconv [other arguments...] -o <file>.utf8 <file>

=back

You'll need to invoke B<mig-add> prior to using commands like B<mig-iconv>.

=head1 SYNOPSIS

B<mig-iconv> <file> [other arguments...]

=cut

###############################################################################

use strict;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use DBI;
use Cwd 'abs_path';
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";
use Mig;

pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

my $file = abs_path($ARGV[0]);
if ($file =~ /^$MIGBASEWORKDIR/) {
    call_iconv(@ARGV);
} else {
    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
}

exit 0;

###############################################################################

sub call_iconv {
    my $file = abs_path(shift);
    my @args = @_;

    my $tracked_file_id = Mig::check_for_tracked_file($file);
    if ($tracked_file_id) {
        my $data = Mig::status_this_file($file);
        print "iconv'ing tracked file: $file\n";

        if (scalar(@args) == 0) {
            @args = (
                '-f'
                ,'ISO-8859-1'
                ,'-t'
                ,'UTF-8'
                ,'--verbose'
            );
        }

        system('iconv', @args, '-o', $file . '.utf8', $file);
        system('touch', $file . '.utf8'); # handle 0-byte files
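        # (the touch is presumably what guarantees <file>.utf8 exists with a
        # fresh mtime even when the source was empty, so the tracking UPDATE
        # below can still record it -- an inference from the comment above)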
        my $dbh = Mig::db_connect();
        my $utf8_file = $dbh->quote($file . '.utf8');
        if (! -e $file . '.utf8') {
            print "utf8 file does not exist: $utf8_file\n";
            $utf8_file = $dbh->quote('');
        }

        my $rv = $dbh->do("
            UPDATE $MIGSCHEMA.tracked_file
            SET utf8_filename = $utf8_file
            WHERE base_filename = " . $dbh->quote($file) . "
            ;
        ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
        Mig::db_disconnect($dbh);
    } else {
        print "File not currently tracked: $file\n";
    }
}

diff --git a/mig-bin/mig-init b/mig-bin/mig-init
new file mode 100755
index 0000000..05ad5d8
--- /dev/null
+++ b/mig-bin/mig-init
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-init - This will add or recreate tracking tables for the B<mig> toolset to
the migration schema specified by the MIGSCHEMA environment variable, in the
PostgreSQL database specified by various PG environment variables.

In practice, you should invoke 'mig env use schema_name' prior to calling
B<mig-init>.

=head1 SYNOPSIS

B<mig-init>

B<mig-init> --help

=cut

###############################################################################

use strict;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use DBI;
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";
use Mig;

pod2usage(-verbose => 2) if $ARGV[0];

Mig::die_if_no_env_migschema();

if (! Mig::check_for_db_migschema()) {
    try_to_create_schema();
}

if (! Mig::check_db_migschema_for_migration_tables()) {
    try_to_init_schema_with_migration_tools();
}
vivicate_mig_tracking_tables();

exit 0;

###############################################################################

sub try_to_create_schema {
    if ($MIGSCHEMA =~ /[^\w_]/) {
        die "$MIGSCHEMA is not suitable for a schema name in PostgreSQL\n";
    }
    my $dbh = Mig::db_connect();
    my $rv = $dbh->do("CREATE SCHEMA $MIGSCHEMA;")
        || die "Error creating migration schema ($MIGSCHEMA): $!\n";
    print "Created schema $MIGSCHEMA\n";
    Mig::db_disconnect($dbh);
}

sub try_to_init_schema_with_migration_tools {
    Mig::die_if_no_migration_tools();
    print "Calling migration_tools.init() and .build()\n";
    my $dbh = Mig::db_connect();
    my $rv = $dbh->do("SELECT migration_tools.init(" . $dbh->quote($MIGSCHEMA) . ");")
        || die "Error running migration_tools.init($MIGSCHEMA): $!\n";
    print "migration_tools.init() finished\n";
    my $rv2 = $dbh->do("SELECT migration_tools.build(" . $dbh->quote($MIGSCHEMA) . 
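        # (build() fleshes out MIGSCHEMA with staging tables patterned on
        # stock Evergreen ones, e.g. the asset_copy that
        # check_db_migschema_for_migration_tables() looked for above)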
");") + || die "Error running migration_tools.build($MIGSCHEMA): $!\n"; + print "migration_tools.build() finished\n"; + Mig::db_disconnect($dbh); +} + +sub vivicate_mig_tracking_tables { + Mig::die_if_mig_tracking_table_exists(); + print "Creating table $MIGSCHEMA.tracked_file...\n"; + my $dbh = Mig::db_connect(); + my $rv = $dbh->do(" + CREATE TABLE $MIGSCHEMA.tracked_file ( + id serial + ,base_filename TEXT UNIQUE + ,has_headers BOOLEAN + ,headers_file TEXT + ,utf8_filename TEXT + ,clean_filename TEXT + ,stage_sql_filename TEXT + ,map_sql_filename TEXT + ,prod_sql_filename TEXT + ,parent_table TEXT + ,staged_table TEXT + ); + ") || die "Error creating table $MIGSCHEMA.tracked_file: $!\n"; + Mig::die_if_mig_column_tracking_table_exists(); + my $rv2 = $dbh->do(" + CREATE TABLE $MIGSCHEMA.tracked_column ( + id serial + ,base_filename TEXT + ,parent_table TEXT + ,staged_table TEXT + ,staged_column TEXT + ,comment TEXT + ,target_table TEXT + ,target_column TEXT + ,transform TEXT + ,summarize BOOLEAN + ); + ") || die "Error creating table $MIGSCHEMA.tracked_column: $!\n"; + my $rv3 = $dbh->do(" + CREATE INDEX ON $MIGSCHEMA.tracked_column(target_table,target_column); + ") || die "Error creating index on $MIGSCHEMA.tracked_column: $!\n"; + my $rv4 = $dbh->do(" + CREATE INDEX ON $MIGSCHEMA.tracked_column(base_filename); + ") || die "Error creating index on $MIGSCHEMA.tracked_column: $!\n"; + Mig::db_disconnect($dbh); +} + + diff --git a/mig-bin/mig-link b/mig-bin/mig-link new file mode 100755 index 0000000..1a8ccd7 --- /dev/null +++ b/mig-bin/mig-link @@ -0,0 +1,87 @@ +#!/usr/bin/perl -w +############################################################################### +=pod + +=head1 NAME + +mig-link + +Associate the specified file with a parent table within the migration schema. + +=head1 SYNOPSIS + +B + +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help'; + +Mig::die_if_no_env_migschema(); +Mig::die_if_mig_tracking_table_does_not_exist(); + +my $file = abs_path($ARGV[0]); +if ($file =~ /^$MIGBASEWORKDIR/) { + link_table(@ARGV); +} else { + print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n"; +} + +exit 0; + +############################################################################### + +sub link_table { + my $file = abs_path(shift); + my $table = shift; + + if (! Mig::check_db_migschema_for_specific_table($table)) { + die "table not found in MIGSCHEMA ($MIGSCHEMA): $table\n"; + } + + my $tracked_file_id = Mig::check_for_tracked_file($file); + if ($tracked_file_id) { + my $data = Mig::status_this_file($file); + + print "linking file to parent table: $file -> $table\n"; + + my $dbh = Mig::db_connect(); + my $sth = $dbh->prepare(" + SELECT base_filename + FROM $MIGSCHEMA.tracked_file + WHERE parent_table = " . $dbh->quote($table) . " + AND base_filename <> " . $dbh->quote($file) . ";" + ); + my $rv = $sth->execute() + || die "Error checking $MIGSCHEMA.tracked_file: $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + if ($cols[0]) { # found + die "table ($table) already linked to a different file: $cols[0]\n"; + } + $rv = $dbh->do(" + UPDATE $MIGSCHEMA.tracked_file + SET parent_table = " . 
$dbh->quote($table) . "
            WHERE base_filename = " . $dbh->quote($file) . "
            ;
        ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
        Mig::db_disconnect($dbh);
    } else {
        print "File not currently tracked: $file\n";
    }
}

diff --git a/mig-bin/mig-mapper b/mig-bin/mig-mapper
new file mode 100755
index 0000000..6841cf7
--- /dev/null
+++ b/mig-bin/mig-mapper
@@ -0,0 +1,778 @@
+#!/usr/bin/perl -w
###############################################################################
=pod

=head1 NAME

mig-mapper

Interactive session for analyzing, flagging, and mapping legacy field data to
Evergreen fields.

Upon exit, generate either [file].clean.map.sql or <parent table>_map.sql. The
SQL generated will be UPDATEs for setting the Evergreen-specific columns for a
given file's staging tables, and TRUNCATEs and INSERTs for auxiliary tables.
The files will have \include hooks for pulling in additional mapping files
(for example, end-user mappings for circ modifiers, etc.)

=head1 SYNOPSIS

B<mig-mapper> <file>

=cut

###############################################################################

use strict;
use Term::ReadLine;
use Switch;
use Env qw(
    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
);
use Pod::Usage;
use DBI;
use Cwd 'abs_path';
use FindBin;
my $mig_bin = "$FindBin::Bin/";
use lib "$FindBin::Bin/";
use Mig;

pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

my $column_filter = 1; # show all fields
my $file = abs_path($ARGV[0]);
my $fdata;
my $tracked_file_id = Mig::check_for_tracked_file($file);
if ($tracked_file_id) {
    $fdata = Mig::status_this_file($file);
} else {
    die "File not currently tracked: $file\n";
}

my $table = $fdata->{staged_table};
if (!$table) {
    die "No staged table for file: $file\n";
}

my $loop = 1;
my $term = Term::ReadLine->new('mapper');
my $prompt;
my $OUT = $term->OUT || \*STDOUT;
my @dtd_identifiers;

table_menu();
$prompt = "$fdata->{staged_table}: ";
while ( $loop && defined (my $cmd = $term->readline($prompt)) ) {
top:
    $cmd =~ s/^\s+//;
    $cmd =~ s/\s+$//;
    $term->addhistory($cmd) if $cmd =~ /\S/;
    if ($cmd =~ /^\d+$/) {
        my $ret = column_menu($cmd);
        if ($ret) {
            $cmd = $ret;
            goto top;
        }
    } else {
        switch($cmd) {
            case /^(ls|\?|\.|;)$/ {
                table_menu();
            }
            case '' {
                table_menu();
            }
            case 'l' {
                list_ten();
            }
            case 'f1' {
                $column_filter = 1;
                table_menu();
            }
            case 'f2' {
                $column_filter = 2;
                table_menu();
            }
            case 'f3' {
                $column_filter = 3;
                table_menu();
            }
        }
    }
    $loop = 0 if $cmd =~ /^q/io;
}

exit 0;

###############################################################################

sub table_menu {
    print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
    print "$table";
    print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
    print "\n";
    print " l) SELECT * FROM $fdata->{staged_table} LIMIT 10;\n";
    print "f1) show all fields (default)\n";
    print "f2) show legacy fields\n";
    print "f3) show EG fields\n";
    print " q) quit\n\n";
    printf "%-36s", "Columns (* for required)";
    printf "%-30s", "Target";
    printf "%-30s", "Transform";
    printf "%-30s", "First Row";
    printf "%-30s", "Migration Note";
    print "\n";
    printf "%-36s", "-------";
    printf "%-30s", "------";
    printf 
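    # (Illustrative mapper exchange, values hypothetical: at the table menu,
    # entering a column's number opens its own menu, where 't actor_usr.usrname'
    # sets the target, 'e BTRIM(l_username)' records a transform, 'c needs
    # review' attaches a note, 's' summarizes, and 'g' browses grouped values.)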
"%-30s", "---------"; + printf "%-30s", "---------"; + printf "%-30s", "--------------"; + print "\n"; + my $dbh = Mig::db_connect(); + my $sth = $dbh->prepare(" + SELECT * + FROM information_schema.columns + WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . " + AND table_name = " . $dbh->quote($table) . " + ORDER BY dtd_identifier::INTEGER ASC; + "); + my $rv = $sth->execute() + || die "Error retrieving data from information_schema: $!"; + my $sth2 = $dbh->prepare(" + SELECT * + FROM $MIGSCHEMA.$table + LIMIT 1; + "); + my $rv2 = $sth2->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + my $row = $sth2->fetchrow_hashref; + + open LESS, "|less -F"; + @dtd_identifiers = (); + while (my $data = $sth->fetchrow_hashref) { + my $column = $data->{column_name}; + if ($column_filter == 2 && !($column =~ /^[xl]_/)) { + next; + } + if ($column_filter == 3 && ($column =~ /^[xl]_/)) { + next; + } + my $cdata = status_this_column($column); + printf LESS $cdata->{required} ? '*' : ' '; + printf LESS "%3s) ", $data->{dtd_identifier}; + push @dtd_identifiers, $data->{dtd_identifier}; + printf LESS "%-30s", $column; + printf LESS "%-30s", defined $cdata->{target_table} + ? ( $cdata->{target_table} ne $table ? $cdata->{target_table} . '.' : '') . $cdata->{target_column} + : ''; + printf LESS "%-30s", defined $cdata->{transform} ? $cdata->{transform} : ''; + printf LESS "%-30s", defined $$row{$column} ? $$row{$column} : ''; + printf LESS "%-30s", defined $cdata->{comment} ? $cdata->{comment} : ''; + print LESS "\n"; + } + close LESS; + print "\n"; + $sth->finish; + $sth2->finish; + Mig::db_disconnect($dbh); +} + +sub column_menu { + my $dtd_identifier = shift; + my $dbh = Mig::db_connect(); + my $sth = $dbh->prepare(" + SELECT * + FROM information_schema.columns + WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . " + AND table_name = " . $dbh->quote($table) . " + AND dtd_identifier = " . $dbh->quote($dtd_identifier) . "; + "); + my $rv = $sth->execute() + || die "Error retrieving data from information_schema: $!"; + my $data = $sth->fetchrow_hashref; + $sth->finish; + Mig::db_disconnect($dbh); + + my $column = $data->{column_name}; + + my $prompt = "$table.$column: "; + + sub print_menu { + my $column = shift; + my $cdata = status_this_column($column); + print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"; + print "$column"; + print "\n-------------------------------------------------------------------------------------------------\n"; + print " target: " . ( + defined $cdata->{target_table} + ? ( $cdata->{target_table} ne $table ? $cdata->{target_table} . '.' : '') . $cdata->{target_column} + : '' + ) . "\n"; + print "transform: " . (defined $cdata->{transform} ? $cdata->{transform} : '') . "\n"; + print " comment: " . (defined $cdata->{comment} ? $cdata->{comment} : '') . 
"\n"; + print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"; + print "\n"; + print " l) SELECT $column FROM $fdata->{staged_table} LIMIT 10;\n"; + print " s) summarize\n" if $column ne 'x_migrate'; + print " g) group browse\n"; + print "g2) group browse (order by count desc)\n"; + print " c) comment\n"; + print " f) flag for end-user mapping\n"; + print " t) target\n"; + print " e) eval/transform\n"; + print " n) next column\n"; + print " p) prev column\n"; + print " q) quit back to table menu\n"; + print "\n"; + } + print_menu($column); + + my $loop = 1; + while ( $loop && defined (my $cmd = $term->readline($prompt)) ) { + $cmd =~ s/^\s+//; + $cmd =~ s/\s+$//; + $term->addhistory($cmd) if $cmd =~ /\S/; + $loop = 0 if $cmd =~ /^q/io; + switch($cmd) { + case /^(ls|\?|\.|;)$/ { + print_menu($column); + } + case '' { + print_menu($column); + } + case 'l' { + list_ten($column); + } + case 's' { + summarize($column); + } + case 'g' { + group_browse($column); + } + case 'g2' { + group_browse($column,'GROUP BY 1 ORDER BY 2 DESC'); + } + case /^c/io { + if ($cmd =~ /^c\s+(.+)$/) { + set_comment($column,$1); + } + } + case /^t/io { + if ($cmd =~ /^t\s+(.+)$/) { + set_target($column,$1); + } + } + case /^e/io { + if ($cmd =~ /^e\s+(.+)$/) { + set_transform($column,$1); + } + } + case 'n' { + my( $index )= grep { $dtd_identifiers[$_] eq $dtd_identifier } 0..$#dtd_identifiers; + return $dtd_identifiers[$index + 1]; + } + case 'p' { + my( $index )= grep { $dtd_identifiers[$_] eq $dtd_identifier } 0..$#dtd_identifiers; + return $dtd_identifiers[$index - 1]; + } + } + } +} + +sub list_ten { + my $column = shift; + + my $dbh = Mig::db_connect(); + my $sth; + my $rv; + my @cols; + + $sth = $dbh->prepare(Mig::sql(" + SELECT " . (defined $column ? $column : '*') . " + FROM $MIGSCHEMA.$table + LIMIT 10; + ")); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + print "\n"; + while (@cols = $sth->fetchrow_array) { + print "\t" . join(',',map {defined $_ ? $_ : ''} @cols) . 
"\n"; + } + print "\n"; + $sth->finish; +} + +sub summarize { + my $column = shift; + + my $count; + my $non_empty_count; + my $distinct_value_count; + my $distinct_integer_value_count; + my $distinct_money6_value_count; + my $distinct_money8_value_count; + my $distinct_date_value_count; + my $distinct_timestamptz_value_count; + + my $min_value; + my $min_length; + my $min_length_min_value; + my $max_value; + my $max_length; + my $max_length_max_value; + + my $min_value_as_integer; + my $max_value_as_integer; + + my $min_value_as_money6; + my $max_value_as_money6; + + my $min_value_as_money8; + my $max_value_as_money8; + + my $min_value_as_date; + my $max_value_as_date; + + my $min_value_as_timestamptz; + my $max_value_as_timestamptz; + + my $dbh = Mig::db_connect(); + my $sth; + my $rv; + my @cols; + + ### count + $sth = $dbh->prepare(" + SELECT COUNT(*) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $count = $cols[0]; + + ### non_empty_count + $sth = $dbh->prepare(" + SELECT COUNT(*) + FROM $MIGSCHEMA.$table + WHERE $column IS NOT NULL AND BTRIM($column) <> ''; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $non_empty_count = $cols[0]; + + ### distinct_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT $column) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $distinct_value_count = $cols[0]; + + ### distinct_integer_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT migration_tools.attempt_cast($column,'INTEGER')::INTEGER) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $distinct_integer_value_count = $cols[0]; + + ### distinct_money6_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $distinct_money6_value_count = $cols[0]; + + ### distinct_money8_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $distinct_money8_value_count = $cols[0]; + + ### distinct_date_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $distinct_date_value_count = $cols[0]; + + ### distinct_timestamptz_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $distinct_timestamptz_value_count = $cols[0]; + + ### min_value + $sth = $dbh->prepare(" + SELECT MIN($column) + 
FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_value = $cols[0]; + + ### min_length + $sth = $dbh->prepare(" + SELECT MIN(LENGTH($column)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_length = $cols[0]; + + ### min_length_min_value + $sth = $dbh->prepare(" + SELECT MIN($column) + FROM $MIGSCHEMA.$table + WHERE LENGTH($column) = $min_length; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_length_min_value = $cols[0]; + + ### min_value_as_integer + $sth = $dbh->prepare(" + SELECT MIN(migration_tools.attempt_cast($column,'INTEGER')::INTEGER) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_value_as_integer = $cols[0]; + + ### min_value_as_money6 + $sth = $dbh->prepare(" + SELECT MIN(NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_value_as_money6 = $cols[0]; + + ### min_value_as_money8 + $sth = $dbh->prepare(" + SELECT MIN(NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_value_as_money8 = $cols[0]; + + ### min_value_as_date + $sth = $dbh->prepare(" + SELECT MIN(NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_value_as_date = $cols[0]; + + ### min_value_as_timestamptz + $sth = $dbh->prepare(" + SELECT MIN(NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $min_value_as_timestamptz = $cols[0]; + + ### max_value + $sth = $dbh->prepare(" + SELECT MAX($column) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $max_value = $cols[0]; + + ### max_length + $sth = $dbh->prepare(" + SELECT MAX(LENGTH($column)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $max_length = $cols[0]; + + ### max_length_max_value + $sth = $dbh->prepare(" + SELECT MAX($column) + FROM $MIGSCHEMA.$table + WHERE LENGTH($column) = $max_length; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + $max_length_max_value = $cols[0]; + + ### max_value_as_integer + $sth = $dbh->prepare(" + SELECT MAX(migration_tools.attempt_cast($column,'INTEGER')::INTEGER) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = 
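            # (note: the text-mode MIN/MAX queries earlier in this sub compare
            # lexicographically, so e.g. the string 10 sorts before the string 9)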
$sth->fetchrow_array;
+    $sth->finish;
+    $max_value_as_integer = $cols[0];
+
+    ### max_value_as_money6
+    $sth = $dbh->prepare("
+        SELECT MAX(NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    $rv = $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+    @cols = $sth->fetchrow_array;
+    $sth->finish;
+    $max_value_as_money6 = $cols[0];
+
+    ### max_value_as_money8
+    $sth = $dbh->prepare("
+        SELECT MAX(NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    $rv = $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+    @cols = $sth->fetchrow_array;
+    $sth->finish;
+    $max_value_as_money8 = $cols[0];
+
+    ### max_value_as_date
+    $sth = $dbh->prepare("
+        SELECT MAX(NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
+        FROM $MIGSCHEMA.$table;
+    ");
+    $rv = $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+    @cols = $sth->fetchrow_array;
+    $sth->finish;
+    $max_value_as_date = $cols[0];
+
+    ### max_value_as_timestamptz
+    $sth = $dbh->prepare("
+        SELECT MAX(NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
+        FROM $MIGSCHEMA.$table;
+    ");
+    $rv = $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+    @cols = $sth->fetchrow_array;
+    $sth->finish;
+    $max_value_as_timestamptz = $cols[0];
+
+    Mig::db_disconnect($dbh);
+
+    print "\n";
+    print "# of rows = $count\n";
+    print "# of non-empty rows = $non_empty_count\n";
+    print "# of distinct values (as text) = $distinct_value_count\n";
+    print "# of distinct values (as integer) = $distinct_integer_value_count\n";
+    print "# of distinct values (as money6) = $distinct_money6_value_count\n";
+    print "# of distinct values (as money8) = $distinct_money8_value_count\n";
+    print "# of distinct values (as date) = $distinct_date_value_count\n";
+    print "# of distinct values (as timestamptz) = $distinct_timestamptz_value_count\n";
+    print "\n";
+    print "minimum value (as text) = $min_value\n";
+    print "maximum value (as text) = $max_value\n";
+    print "\n";
+    print "minimum value length (as text) = $min_length (min value: $min_length_min_value)\n";
+    print "maximum value length (as text) = $max_length (max value: $max_length_max_value)\n";
+    print "\n";
+    print "minimum value (as integer) = " . (defined $min_value_as_integer ? $min_value_as_integer : '') . "\n";
+    print "maximum value (as integer) = " . (defined $max_value_as_integer ? $max_value_as_integer : '') . "\n";
+    print "\n";
+    print "minimum value (as money6) = " . (defined $min_value_as_money6 ? $min_value_as_money6 : '') . "\n";
+    print "maximum value (as money6) = " . (defined $max_value_as_money6 ? $max_value_as_money6 : '') . "\n";
+    print "\n";
+    print "minimum value (as money8) = " . (defined $min_value_as_money8 ? $min_value_as_money8 : '') . "\n";
+    print "maximum value (as money8) = " . (defined $max_value_as_money8 ? $max_value_as_money8 : '') . "\n";
+    print "\n";
+    print "minimum value (as date) = " . (defined $min_value_as_date ? $min_value_as_date : '') . "\n";
+    print "maximum value (as date) = " . (defined $max_value_as_date ? $max_value_as_date : '') . "\n";
+    print "\n";
+    print "minimum value (as timestamptz) = " . (defined $min_value_as_timestamptz ? $min_value_as_timestamptz : '') . "\n";
+    print "maximum value (as timestamptz) = " . (defined $max_value_as_timestamptz ? $max_value_as_timestamptz : '') .
"\n"; + print "\n"; +} + +sub group_browse { + my ($column,$option) = (shift,shift||"GROUP BY 1 ORDER BY 1"); + + my $dbh = Mig::db_connect(); + my $sth; + my $rv; + + $sth = $dbh->prepare(Mig::sql(" + SELECT $column, COUNT(*) + FROM $MIGSCHEMA.$table + $option; + ")); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + + print "\n"; + open LESS, "|less -F"; + printf LESS "%-30s", "Value:"; + print LESS "Count:\n\n"; + while (my @cols = $sth->fetchrow_array) { + my $value = $cols[0]; + my $count = $cols[1]; + printf LESS "%-30s", defined $value ? $value : ''; + print LESS "$count\n"; + } + close LESS; + print "\n"; + $sth->finish; +} + +############################################################################### + +sub add_this_column { + my $column = shift; + if (!Mig::check_for_tracked_column($table,$column)) { + my $dbh = Mig::db_connect(); + my $rv = $dbh->do(" + INSERT INTO $MIGSCHEMA.tracked_column ( + base_filename + ,parent_table + ,staged_table + ,staged_column + ) VALUES ( + " . $dbh->quote($file) . " + ," . $dbh->quote($fdata->{parent_table}) . " + ," . $dbh->quote($table) . " + ," . $dbh->quote($column) . " + ); + ") || die "Error inserting into table $MIGSCHEMA.tracked_column: $!\n"; + Mig::db_disconnect($dbh); + } +} + +sub status_this_column { + my $column = shift; + my $data = Mig::status_this_column($table,$column); + if (!$data) { + add_this_column($column); + $data = Mig::status_this_column($table,$column); + } + if ($$data{parent_table}) { + my $dbh = Mig::db_connect(); + my $sth = $dbh->prepare(" + SELECT * + FROM $MIGSCHEMA.fields_requiring_mapping + WHERE table_name = " . $dbh->quote( $$data{parent_table} ) . " + AND column_name = " . $dbh->quote( $column ) . ";" + ); + my $rv = $sth->execute() + || die "Error checking table (tracked_column) for $table.$column: $!"; + my $data2 = $sth->fetchrow_hashref; + if ($data2) { + $$data{required} = 1; + } else { + $$data{required} = 0; + } + $sth->finish; + Mig::db_disconnect($dbh); + } + return $data; +} + +sub set_comment { + my ($column,$comment) = (shift,shift); + if ($comment) { + my $data = status_this_column($column); + my $dbh = Mig::db_connect(); + my $rv = $dbh->do(" + UPDATE $MIGSCHEMA.tracked_column + SET comment = " . $dbh->quote($comment) . " + WHERE id = " . $dbh->quote($data->{id}) . "; + ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n"; + Mig::db_disconnect($dbh); + } +} + +sub set_transform { + my ($column,$transform) = (shift,shift); + if ($transform) { + my $data = status_this_column($column); + my $dbh = Mig::db_connect(); + my $rv = $dbh->do(" + UPDATE $MIGSCHEMA.tracked_column + SET transform = " . $dbh->quote($transform) . " + WHERE id = " . $dbh->quote($data->{id}) . "; + ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n"; + Mig::db_disconnect($dbh); + } +} + +sub set_target { + my ($column,$target) = (shift,shift); + my $target_table = $table; + my $target_column = $target; + if ($target) { + if ($target =~ /^(.+)\.(.+)$/) { + $target_table = $1; + $target_column = $2; + } + my $data = status_this_column($column); + my $dbh = Mig::db_connect(); + my $rv = $dbh->do(" + UPDATE $MIGSCHEMA.tracked_column + SET target_table = " . $dbh->quote($target_table) . " + ,target_column = " . $dbh->quote($target_column) . " + WHERE id = " . $dbh->quote($data->{id}) . 
"; + ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n"; + Mig::db_disconnect($dbh); + } +} diff --git a/mig-bin/mig-quicksheet b/mig-bin/mig-quicksheet new file mode 100755 index 0000000..baf19c7 --- /dev/null +++ b/mig-bin/mig-quicksheet @@ -0,0 +1,286 @@ +#!/usr/bin/perl -w +############################################################################### +=pod + +=head1 NAME + +mig-quicksheet + +Quickly produces a simple Excel spreadsheet based on the tracked file suitable +for simple end-user mapping. The new file is named after the tracked file, but +ends in .mapping.xls + +=head1 SYNOPSIS + +B + +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Spreadsheet::WriteExcel; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help'; + +Mig::die_if_no_env_migschema(); +Mig::die_if_mig_tracking_table_does_not_exist(); + +my $has_x_source = 0; +my $file = abs_path($ARGV[0]); +my $fdata; +my $tracked_file_id = Mig::check_for_tracked_file($file); +if ($tracked_file_id) { + $fdata = Mig::status_this_file($file); +} else { + die "File not currently tracked: $file\n"; +} + +my $table = $fdata->{staged_table}; +if (!$table) { + die "No staged staged table for file: $file\n"; +} + +my $workbook = Spreadsheet::WriteExcel->new($file . '.mapping.xls'); +my @worksheets = (); +print "Writing $file.mapping.xls\n"; +print "Sheet: Field Summary\n"; +my $first_sheet = $workbook->add_worksheet('Field Summary'); +my $bold = $workbook->add_format(); +$bold->set_bold(); +$bold->set_align('left'); +my $left = $workbook->add_format(); +$left->set_align('left'); +$first_sheet->set_column(0,6,30); + +my $dbh = Mig::db_connect(); +my $sth = $dbh->prepare(" + SELECT COUNT(*) + FROM $MIGSCHEMA.$table + LIMIT 1; +"); +my $rv = $sth->execute() + || die "Error retrieving data from information_schema: $!"; + +my @cols = $sth->fetchrow_array; +$sth->finish; +my $count = $cols[0]; +Mig::db_disconnect($dbh); + +$first_sheet->write(0,0,'Source File:',$bold); +$first_sheet->write(0,1,$file,$left); +$first_sheet->write(1,0,'Number of Rows:',$bold); +$first_sheet->write(1,1,$count,$left); + +my $sheet_row_start = 4; + +$first_sheet->write($sheet_row_start,0,'Legacy Column',$bold); +$first_sheet->write($sheet_row_start,1,'Non-Empty Rows',$bold); +$first_sheet->write($sheet_row_start,2,'Distinct Non-NULL Values',$bold); +$first_sheet->write($sheet_row_start,3,'Min Value',$bold); +$first_sheet->write($sheet_row_start,4,'Min Length',$bold); +$first_sheet->write($sheet_row_start,5,'Max Value',$bold); +$first_sheet->write($sheet_row_start,6,'Max Length',$bold); + +handle_columns(); + +$workbook->close(); + +exit 0; + +############################################################################### + +sub handle_columns { + my $dbh = Mig::db_connect(); + my $sth = $dbh->prepare(" + SELECT * + FROM information_schema.columns + WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . " + AND table_name = " . $dbh->quote($table) . 
" + ORDER BY dtd_identifier::INTEGER ASC; + "); + my $rv = $sth->execute() + || die "Error retrieving data from information_schema: $!"; + + my $sheet_row_offset = 0; + + while (my $data = $sth->fetchrow_hashref) { + my $column = $data->{column_name}; + if ($column eq 'x_source') { + $has_x_source = 1; + } + if ($column =~ /^l_/ + || ($column =~ /^x_/ + && ( $column ne 'x_migrate' + && $column ne 'x_source' + && $column ne 'x_egid' + && $column ne 'x_hseq' + ) + ) + ) { + $sheet_row_offset++; + my $cdata = column_summary($column); + $first_sheet->write($sheet_row_start + $sheet_row_offset,0,$column,$left); + $first_sheet->write($sheet_row_start + $sheet_row_offset,1,$cdata->{non_empty_count},$left); + $first_sheet->write($sheet_row_start + $sheet_row_offset,2,$cdata->{distinct_value_count},$left); + $first_sheet->write($sheet_row_start + $sheet_row_offset,3,$cdata->{min_value},$left); + $first_sheet->write($sheet_row_start + $sheet_row_offset,4,$cdata->{min_length},$left); + $first_sheet->write($sheet_row_start + $sheet_row_offset,5,$cdata->{max_value},$left); + $first_sheet->write($sheet_row_start + $sheet_row_offset,6,$cdata->{max_length},$left); + if ($cdata->{distinct_value_count} > 1 && $cdata->{distinct_value_count} <= 500) { + group_by($column); + } + } + } + $sth->finish; + Mig::db_disconnect($dbh); +} + +sub column_summary { + + my $column = shift; + + my $dbh = Mig::db_connect(); + + ### non_empty_count + my $sth = $dbh->prepare(" + SELECT COUNT(*) + FROM $MIGSCHEMA.$table + WHERE $column IS NOT NULL AND BTRIM($column) <> ''; + "); + my $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + my @cols = $sth->fetchrow_array; + $sth->finish; + my $non_empty_count = $cols[0]; + + ### distinct_value_count + $sth = $dbh->prepare(" + SELECT COUNT(DISTINCT $column) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + my $distinct_value_count = $cols[0]; + + ### min_value + $sth = $dbh->prepare(" + SELECT MIN($column) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + my $min_value = $cols[0]; + + ### min_length + $sth = $dbh->prepare(" + SELECT MIN(LENGTH($column)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + my $min_length = $cols[0]; + + ### max_value + $sth = $dbh->prepare(" + SELECT MAX($column) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + my $max_value = $cols[0]; + + ### max_length + $sth = $dbh->prepare(" + SELECT MAX(LENGTH($column)) + FROM $MIGSCHEMA.$table; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + @cols = $sth->fetchrow_array; + $sth->finish; + my $max_length = $cols[0]; + + return { + non_empty_count => $non_empty_count + ,distinct_value_count => $distinct_value_count + ,min_value => defined $min_value ? $min_value : '' + ,min_length => defined $min_length ? $min_length : '' + ,max_value => defined $max_value ? $max_value : '' + ,max_length => defined $max_length ? 
$max_length : '' + }; +} + +sub group_by { + my ($column,$option) = (shift,"GROUP BY 2 ORDER BY 2"); + + print "Sheet: $column\n"; + my $col_sheet = $workbook->add_worksheet(substr($column,0,31)); + push @worksheets, $col_sheet; + $col_sheet->set_column(0,6,30); + + my $col_sheet_row_start = 0; + my $col_sheet_row_offset = 0; + + $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,0,'Count',$bold); + if ($has_x_source) { + $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,1,'Source',$bold); + $option = "GROUP BY 2,3 ORDER BY 2,3"; + } + $col_sheet->write( + $col_sheet_row_start + $col_sheet_row_offset + ,$has_x_source ? 2 : 1 + ,"Legacy Value for $column" + ,$bold + ); + + my $dbh = Mig::db_connect(); + my $sth; + my $rv; + + $sth = $dbh->prepare(" + SELECT COUNT(*), " . ($has_x_source ? 'x_source, ' : '') . "$column + FROM $MIGSCHEMA.$table + $option; + "); + $rv = $sth->execute() + || die "Error retrieving data from $MIGSCHEMA.$table: $!"; + + while (my @cols = $sth->fetchrow_array) { + $col_sheet_row_offset++; + my $count = $cols[0]; + $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,0,$count,$left); + my $value; + if ($has_x_source) { + my $source = defined $cols[1] ? $cols[1] : ''; + $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,1,$source,$left); + $value = defined $cols[2] ? $cols[2] : ''; + $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,2,$value,$left); + } else { + $value = defined $cols[1] ? $cols[1] : ''; + $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,1,$value,$left); + } + } + $sth->finish; +} + diff --git a/mig-bin/mig-remove b/mig-bin/mig-remove new file mode 100755 index 0000000..cf70eda --- /dev/null +++ b/mig-bin/mig-remove @@ -0,0 +1,67 @@ +#!/usr/bin/perl -w +############################################################################### +=pod + +=head1 NAME + +mig-remove - This will remove the specified files from the mig tracking table +for the schema pointed to by the MIGSCHEMA environment variable in the +PostgreSQL database specified by various PG environment variables. + +You'll need to invoke B prior to using commands like B + +=head1 SYNOPSIS + +B [file] [...] + +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if ! 
$ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+foreach my $arg (@ARGV) {
+    my $file = abs_path($arg);
+    if ($file =~ /^$MIGBASEWORKDIR/) {
+        remove_this_file($file);
+    } else {
+        print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+    }
+}
+
+exit 0;
+
+###############################################################################
+
+sub remove_this_file {
+    my $file = shift;
+    my $tracked_file_id = Mig::check_for_tracked_file($file,{'allow_missing'=>1});
+    if ($tracked_file_id) {
+        print "removing tracked file: $file\n";
+        my $dbh = Mig::db_connect();
+        my $rv = $dbh->do("
+            DELETE FROM $MIGSCHEMA.tracked_file WHERE id = $tracked_file_id;
+        ") || die "Error deleting from table $MIGSCHEMA.tracked_file (id = $tracked_file_id): $!\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        print "File not currently tracked: $file\n";
+    }
+}
diff --git a/mig-bin/mig-skip-clean b/mig-bin/mig-skip-clean
new file mode 100755
index 0000000..013c075
--- /dev/null
+++ b/mig-bin/mig-skip-clean
@@ -0,0 +1,98 @@
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-skip-clean
+
+Allows you to either use an existing file named <file>.utf8.clean or a
+named [cleaned file] as if it were the one created by mig-clean
+
+Note that the clean file, however specified, should contain headers. The
+remaining tools in the chain will expect this.
+
+=head1 SYNOPSIS
+
+B<mig-skip-clean> <file> [cleaned file]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+    HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+    MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if ! ($ARGV[0]||$ARGV[1]) || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+my $file = abs_path($ARGV[0]);
+my $clean_file;
+if ($ARGV[1]) {
+    $clean_file = abs_path($ARGV[1]);
+}
+if ($clean_file && $clean_file !~ /^$MIGBASEWORKDIR/) {
+    die "File falls outside of MIGWORKDIR ($MIGWORKDIR): $clean_file\n";
+}
+
+if ($file =~ /^$MIGBASEWORKDIR/) {
+    skip_clean($file,$clean_file);
+} else {
+    die "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+}
+
+exit 0;
+
+###############################################################################
+
+sub skip_clean {
+    my $file = shift;
+    my $clean_file = shift;
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if ($tracked_file_id) {
+        my $data = Mig::status_this_file($file);
+
+        if (! $data->{'utf8_filename'}) {
+            die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
+        }
+
+        my $utf8_file = $data->{'utf8_filename'};
+        if (! -e $utf8_file) {
+            die "missing file: $utf8_file\n";
+        }
+
+        print "skipping cleaning of tracked file: $file\n";
+
+        my $dbh = Mig::db_connect();
+        if (! $clean_file) {
+            $clean_file = $utf8_file . '.clean';
+        }
+        if (! -e $clean_file) {
+            die "clean file does not exist: $clean_file\n";
+        }
+
+        my $rv = $dbh->do("
+            UPDATE $MIGSCHEMA.tracked_file
+            SET clean_filename = " . $dbh->quote($clean_file) . "
+            WHERE base_filename = " . $dbh->quote($file) .
" + ; + ") || die "Error inserting into table $MIGSCHEMA.tracked_file: $!\n"; + Mig::db_disconnect($dbh); + } else { + die "File not currently tracked: $file\n"; + } +} diff --git a/mig-bin/mig-skip-iconv b/mig-bin/mig-skip-iconv new file mode 100755 index 0000000..9a36123 --- /dev/null +++ b/mig-bin/mig-skip-iconv @@ -0,0 +1,85 @@ +#!/usr/bin/perl -w +############################################################################### +=pod + +=head1 NAME + +mig-skip-iconv + +Allows you to either use an existing file named .utf8 or a named +[utf8 file] as if it were the one created by mig-iconv + +=head1 SYNOPSIS + +B [utf8 file] + +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if ! ($ARGV[0]||$ARGV[1]) || $ARGV[0] eq '--help'; + +Mig::die_if_no_env_migschema(); +Mig::die_if_mig_tracking_table_does_not_exist(); + +my $file = abs_path($ARGV[0]); +my $utf8_file; +if ($ARGV[1]) { + $utf8_file = abs_path($ARGV[1]); +} +if ($utf8_file && ! $utf8_file =~ /^$MIGBASEWORKDIR/) { + die "File falls outside of MIGWORKDIR ($MIGWORKDIR): $utf8_file\n"; +} + +if ($file =~ /^$MIGBASEWORKDIR/) { + skip_iconv($file,$utf8_file); +} else { + print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n"; +} + +exit 0; + +############################################################################### + +sub skip_iconv { + my $file = shift; + my $utf8_file = shift; + + my $tracked_file_id = Mig::check_for_tracked_file($file); + if ($tracked_file_id) { + my $data = Mig::status_this_file($file); + print "skipping the iconv'ing of tracked file: $file\n"; + + my $dbh = Mig::db_connect(); + if (! $utf8_file) { + $utf8_file = $file . '.utf8'; + } + if (! -e $utf8_file) { + die "utf8 file does not exist: $utf8_file\n"; + } + + my $rv = $dbh->do(" + UPDATE $MIGSCHEMA.tracked_file + SET utf8_filename = " . $dbh->quote($utf8_file) . " + WHERE base_filename = " . $dbh->quote($file) . " + ; + ") || die "Error inserting into table $MIGSCHEMA.tracked_file: $!\n"; + Mig::db_disconnect($dbh); + } else { + print "File not currently tracked: $file\n"; + } +} diff --git a/mig-bin/mig-stage b/mig-bin/mig-stage new file mode 100755 index 0000000..6e7faf5 --- /dev/null +++ b/mig-bin/mig-stage @@ -0,0 +1,128 @@ +#!/usr/bin/perl -w +############################################################################### +=pod + +=head1 NAME + +mig-stage + +Load the SQL-converted version of the specified file into the migration schema. + +Extra arguments are passed to the underlying call to psql + +If the tracked file was previously staged with a different table, drop that +table. + + +=head1 SYNOPSIS + +B [other arguments...] + +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if ! 
$ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+my $file = abs_path($ARGV[0]);
+if ($file =~ /^$MIGBASEWORKDIR/) {
+    stage_csv(@ARGV);
+} else {
+    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+}
+
+exit 0;
+
+###############################################################################
+
+sub stage_csv {
+    my $file = abs_path(shift);
+    my @args = @_;
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if ($tracked_file_id) {
+        my $data = Mig::status_this_file($file);
+
+        if (! $data->{'utf8_filename'}) {
+            die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
+        }
+
+        if (! $data->{'clean_filename'}) {
+            die "mig-clean or mig-skip-clean needed for .clean version of file: $file\n";
+        }
+
+        if (! $data->{'stage_sql_filename'}) {
+            die "mig-convert needed for .stage.sql version of file: $file\n";
+        }
+
+        my $stage_sql_filename = $data->{'stage_sql_filename'};
+        if (! -e $stage_sql_filename) {
+            die "missing file: $stage_sql_filename\n";
+        }
+
+        my $schema_table = `grep 'CREATE UNLOGGED TABLE' $stage_sql_filename | cut -f4 -d\\ | head -1`;
+        chomp $schema_table;
+        my ($schema,$table) = split /\./, $schema_table;
+
+        # check this before touching the database or running psql
+        if ($schema ne $MIGSCHEMA) {
+            die "Schema mismatch: env => $MIGSCHEMA sql => $schema\n";
+        }
+
+        if (defined $data->{'staged_table'} && $data->{'staged_table'} ne $table) {
+            my $dbh2 = Mig::db_connect();
+            print "dropping previously staged table: $MIGSCHEMA.$data->{staged_table}\n";
+            my $rv2 = $dbh2->do("
+                DROP TABLE $MIGSCHEMA.$data->{staged_table};
+            ") || die "Error dropping table $data->{staged_table}: $!\n";
+            print "changing references to old tables\n";
+            my $rv3 = $dbh2->do("
+                UPDATE $MIGSCHEMA.tracked_column
+                SET staged_table = " . $dbh2->quote($table) . "
+                WHERE staged_table = " . $dbh2->quote($data->{staged_table}) . "
+            ") || die "Error changing references to $data->{staged_table}: $!\n";
+            my $rv4 = $dbh2->do("
+                UPDATE $MIGSCHEMA.tracked_column
+                SET target_table = " . $dbh2->quote($table) . "
+                WHERE target_table = " . $dbh2->quote($data->{staged_table}) . "
+            ") || die "Error changing references to $data->{staged_table}: $!\n";
+            Mig::db_disconnect($dbh2);
+        }
+
+        print "running staging SQL: $stage_sql_filename\n";
+
+        system('psql', @args, '-f', $stage_sql_filename);
+
+        if (! Mig::check_db_migschema_for_specific_table($table)) {
+            die "Missing staged table: $schema_table\n";
+        } else {
+            print "table staged: $schema_table\n";
+        }
+
+        my $dbh = Mig::db_connect();
+        my $rv = $dbh->do("
+            UPDATE $MIGSCHEMA.tracked_file
+            SET staged_table = " . $dbh->quote($table) . "
+            WHERE base_filename = " . $dbh->quote($file) . "
+            ;
+        ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        print "File not currently tracked: $file\n";
+    }
+}
diff --git a/mig-bin/mig-status b/mig-bin/mig-status
new file mode 100755
index 0000000..0d78b18
--- /dev/null
+++ b/mig-bin/mig-status
@@ -0,0 +1,87 @@
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-status - This will show tracking information for either the specified files
+or all tracked files if no argument is given.
+
+You'll need to invoke B<mig-init> prior to using commands like B<mig-status>
+
+=head1 SYNOPSIS
+
+B<mig-status> [file] [...]
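A side note on the staging step: stage_csv() in mig-stage above shells out to grep, cut, and head to recover the schema-qualified table name from the generated SQL. The same step can be done in pure Perl; a sketch, with a hypothetical function name:

    # Sketch: capture schema.table from the first CREATE UNLOGGED TABLE line.
    sub schema_table_from_sql {
        my $sql_file = shift;
        open my $fh, '<', $sql_file or die "$sql_file: $!";
        while (my $line = <$fh>) {
            if ($line =~ /^CREATE UNLOGGED TABLE (\w+)\.(\w+)/) {
                close $fh;
                return ($1, $2);
            }
        }
        close $fh;
        return;
    }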
+ +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if scalar(@ARGV) > 0 && $ARGV[0] eq '--help'; + +Mig::die_if_no_env_migschema(); +Mig::die_if_mig_tracking_table_does_not_exist(); + +my @files = @ARGV; +if (scalar(@files) == 0) { + @files = (); + my $dbh = Mig::db_connect(); + my $sth = $dbh->prepare(" + SELECT base_filename + FROM $MIGSCHEMA.tracked_file + ORDER BY 1;" + ); + my $rv = $sth->execute() + || die "Error retrieving data from table (tracked_file): $!"; + my $rows = $sth->fetchall_arrayref; + for my $row ( @$rows ) { + push @files, $row->[0] + } + $sth->finish; + Mig::db_disconnect($dbh); +} + +foreach my $arg (sort @files) { + my $file = abs_path($arg); + my $data = Mig::status_this_file($file); + print "=-=-=\n"; + foreach my $key ( + 'base_filename' + ,'has_headers' + ,'headers_file' + ,'utf8_filename' + ,'clean_filename' + ,'parent_table' + ,'stage_sql_filename' + ,'staged_table' + ,'map_sql_filename' + ,'prod_sql_filename' + ) { + printf "%-20s:\t", $key; + print $data->{$key} ? $data->{$key} : ""; + if ($key =~ /filename$/ && $data->{$key} && ! -e $data->{$key}) { + print " (FILE MISSING)"; + } + print "\n"; + } +} + +exit 0; + +############################################################################### + + diff --git a/mig-bin/mig-unlink b/mig-bin/mig-unlink new file mode 100755 index 0000000..5bf34e4 --- /dev/null +++ b/mig-bin/mig-unlink @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w +############################################################################### +=pod + +=head1 NAME + +mig-unlink + +Clear any association between the specified file and a parent table within the +migration schema. + +=head1 SYNOPSIS + +B + +=cut + +############################################################################### + +use strict; +use Switch; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use DBI; +use Cwd 'abs_path'; +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +use Mig; + +pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help'; + +Mig::die_if_no_env_migschema(); +Mig::die_if_mig_tracking_table_does_not_exist(); + +my $file = abs_path($ARGV[0]); +if ($file =~ /^$MIGBASEWORKDIR/) { + unlink_table(@ARGV); +} else { + print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n"; +} + +exit 0; + +############################################################################### + +sub unlink_table { + my $file = abs_path(shift); + + my $tracked_file_id = Mig::check_for_tracked_file($file); + if ($tracked_file_id) { + my $data = Mig::status_this_file($file); + my $table = $data->{'parent_table'} || ''; + + print "unlinking table ($table) from file: $file\n"; + + my $dbh = Mig::db_connect(); + my $rv = $dbh->do(" + UPDATE $MIGSCHEMA.tracked_file + SET parent_table = '' + WHERE base_filename = " . $dbh->quote($file) . 
" + ; + ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n"; + Mig::db_disconnect($dbh); + } else { + print "File not currently tracked: $file\n"; + } +} diff --git a/text/clean_csv b/text/clean_csv new file mode 100755 index 0000000..599a5d0 --- /dev/null +++ b/text/clean_csv @@ -0,0 +1,792 @@ +#!/usr/bin/perl -w +use Storable; +use Switch; +use Getopt::Long; +use Text::CSV_XS; +use Text::CSV::Separator qw(get_separator); +use Data::Dumper; +use Term::ANSIColor; + +# may be manipulated with --config +our %CSV_options = ( + binary => 1, + auto_diag => 1, + diag_verbose => 1, +); +my $csv; +our $fixes = { + 'R' => [], + 'I' => [], + 'D' => [] +}; +my @parsed_rows; +my @lines_with_errors = (); +my %line_numbers_for_lines_with_errors = (); +my $expected_column_count; +my $line_no; + +# GetOpt variables +my $config; +my $id_cols; +my $fix; +my $nosave; +my $apply; +my $pad; +my $truncate; +my $backslash; +my $debug; +my $help; +my $create_headers; +my $headers_file; +my @headers = (); + +my $pad_count = 0; my $trunc_count = 0; my $fix_count = 0; my $backslash_count = 0; + +################################################################## Subs + +sub format_for_display { + my $formatted_line = shift; + my $sep_char = $CSV_options{sep_char} || '\t'; + my $sep = color 'bold blue'; + $sep .= '<' . (ord($sep_char) < 32 ? ord($sep_char) : $sep_char) . '>'; + $sep .= color 'reset'; + my $quote_char = $CSV_options{quote_char} || ''; + my $quote = color 'bold red'; + $quote .= '<' . (ord($quote_char) < 32 ? ord($quote_char) : $quote_char) . '>'; + $quote .= color 'reset'; + my $escape_char = $CSV_options{escape_char} || ''; + my $escape = color 'bold green'; + $escape .= '<' . (ord($escape_char) < 32 ? ord($escape_char) : $escape_char) . '>'; + $escape .= color 'reset'; + my $real_escape_char = chr(27); + my $real_escape = color 'bold green'; + $real_escape .= '<27>'; + $real_escape .= color 'reset'; + + $formatted_line =~ s/$real_escape_char/$real_escape/g; + $formatted_line =~ s/$sep_char/$sep/g; + $formatted_line =~ s/$quote_char/$quote/g; + $formatted_line =~ s/$escape_char/$escape/g; + for (my $i = 0; $i < 32; $i++) { + if ($i == 27) { next; } + my $other_char = chr($i); + my $other = color 'yellow'; + $other .= "<$i>"; + $other .= color 'reset'; + $formatted_line =~ s/$other_char/$other/g; + } + return "$formatted_line\n"; +} + +sub combine_cols { + my $row = shift; + my $status = $csv->combine(@{ $row }); + if ($status && $csv->string) { + return $csv->string . "\n"; + } else { + die $csv->error_input . "\n"; + } +} + +sub convert_backslashes { + my $line = shift; + my $altered_line; + my @count = $line =~ /\\/g; + if (scalar(@count) > 0) { + my $csv2 = Text::CSV_XS->new(\%CSV_options); + if ($csv2->parse($line)) { + my @columns = $csv2->fields(); + foreach my $c (@columns) { + if ($c ne '\N') { + $c =~ s/\\/\//g; + } + } + $altered_line = combine_cols(\@columns); + } else { + $altered_line =~ s/\\/\//g; + } + if ($line ne $altered_line) { + $backslash_count += scalar(@count); + print "\nline#$line_no>> Converting " . scalar(@count) . " backslashes to forward slashes\n"; + print "before: " . format_for_display($line); + $line = $altered_line; + print " after: " . 
format_for_display($line); + } + } + return $line; +} + +sub apply_line_fixes { + my $line = shift; + foreach my $fix ( @{$fixes->{'R'}} ) { + my $id_regex = $fix->[0]; + if ($line =~ /$id_regex/) { + print "\nline#$line_no>> Applying regex fix for $id_regex\n"; + $fix_count++; + my $regex1 = $fix->[1]; + my $regex2 = $fix->[2]; + my $global = $fix->[3]; + my $ignore_case = $fix->[4]; + print "before: " . format_for_display($line); + $line = fix_via_regex($id_regex,$line,$regex1,$regex2,$global,$ignore_case); + print " after: " . format_for_display($line); + } + } + return $line; +} + +sub apply_insert_fixes { + my $line = shift; + my $cols = shift; + foreach my $fix ( @{$fixes->{'I'}} ) { + my $id_regex = $fix->[0]; + my $col_count_check = $fix->[1]; + if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) { + print "\nline#$line_no>> Applying insert for $id_regex\n"; + $fix_count++; + my $fix_cols = $fix->[2]; + my $fix_value = $fix->[3]; + print "before: " . format_for_display($line); + $line = fix_via_insert($id_regex,$cols,$col_count_check,$fix_cols,$fix_value); + print " after: " . format_for_display($line); + } + } + return $line; +} + +sub apply_delete_fixes { + my $line = shift; + my $cols = shift; + foreach my $fix ( @{$fixes->{'D'}} ) { + my $id_regex = $fix->[0]; + my $col_count_check = $fix->[1]; + if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) { + print "\nline#$line_no>> Applying delete for $id_regex\n"; + $fix_count++; + my $fix_cols = $fix->[2]; + print "before: " . format_for_display($line); + $line = fix_via_delete($id_regex,$cols,$col_count_check,$fix_cols); + print " after: " . format_for_display($line); + } + } + return $line; +} + +sub apply_join_fixes { + my $line = shift; + my $cols = shift; + foreach my $fix ( @{$fixes->{'J'}} ) { + my $id_regex = $fix->[0]; + my $col_count_check = $fix->[1]; + if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) { + print "\nline#$line_no>> Applying join for $id_regex\n"; + $fix_count++; + my $fix_cols = $fix->[2]; + print "before: " . format_for_display($line); + $line = fix_via_join($id_regex,$cols,$col_count_check,$fix_cols); + print " after: " . format_for_display($line); + } + } + return $line; +} + +sub save_fix { + print "saving fix..."; + my $type = shift; + my $fix = shift; + if ($nosave) { print "psyche!\n"; return; } + print "fix = " . Dumper($fix) . "\n" if $debug; + push @{$fixes->{$type}}, $fix; + store $fixes, $ARGV[0] . '.fixes'; + print "saved\n"; +} + +sub id_cols_regex { + my $cols = shift; + my @f = sort(split /,/, $id_cols || '0'); + my $regex = ''; + for (my $i = 0; $i < scalar(@f); $i++) { + if ($i > 0) { + $regex .= '.+'; # characters between id columns + } + $regex .= '.?' . $cols->[$f[$i]] . '.?'; + } + $regex .= ''; + return $regex; +} + +sub fix_via_regex { + my $id_regex = shift; + my $line = shift; + my $fix_regex1 = shift; + my $fix_regex2 = shift; + my $global = shift; + my $ignore_case = shift; + my $save; + + if (!$fix_regex1) { +global_prompt: + print "Global (aka s/match/replace/g)? [n] "; + $global = readline(STDIN); chomp $global; + $global = uc(substr($global,0,1)); + if ($global eq '') { + $global = 'N'; + } + $global = uc(substr($global,0,1)); + if ($global ne 'Y' && $global ne 'N') { + goto global_prompt; + } +case_prompt: + print "Ignore-case (aka s/match/replace/i)? 
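save_fix() persists accumulated fixes next to the input file with Storable, which is what lets a later --apply run retrieve() and replay them. The round trip in miniature; the file name and fix contents here are illustrative only:

    use Storable qw(store retrieve);

    # Illustrative round trip of the .fixes side file.
    my $fixes = { R => [ [ 'id regex', 'match', 'replace', 'N', 'N' ] ] };
    store $fixes, 'input.csv.fixes';
    my $loaded = retrieve('input.csv.fixes');
    print scalar(@{ $loaded->{R} }), " regex fix(es) on disk\n";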
[n] "; + $ignore_case = readline(STDIN); chomp $ignore_case; + $ignore_case = uc(substr($ignore_case,0,1)); + if ($ignore_case eq '') { + $ignore_case = 'N'; + } + $ignore_case = uc(substr($ignore_case,0,1)); + if ($ignore_case ne 'Y' && $ignore_case ne 'N') { + goto case_prompt; + } + +regex1_prompt: + print "Enter match regex for s/match/replace/: "; + $fix_regex1 = readline(STDIN); chomp $fix_regex1; + if ($fix_regex1 eq '') { + goto global_prompt; + } + if ( + ($global eq 'Y' && $ignore_case eq 'Y' && $line =~ /$fix_regex1/gi) + || ($global eq 'Y' && $ignore_case eq 'N' && $line =~ /$fix_regex1/g) + || ($global eq 'N' && $ignore_case eq 'N' && $line =~ /$fix_regex1/i) + ) { + print "Regex matches line.\n"; + } else { + print "Regex does not match line.\n"; + goto regex1_prompt; + } +regex2_prompt: + print "Enter replace regex for s/match/replace/: "; + $fix_regex2 = readline(STDIN); chomp $fix_regex2; + if (substr($fix_regex1,-1) eq '$') { + print "Adding new line to end of /$fix_regex2/ based on \$ in /$fix_regex1/\n"; + $fix_regex2 .= "\n"; + } + # TODO - how to do we handle backreferences with this? + $save = 1; + } + + switch ($global . $ignore_case) { + case 'YY' { $line =~ s/$fix_regex1/$fix_regex2/gi; } + case 'YN' { $line =~ s/$fix_regex1/$fix_regex2/g; } + case 'NY' { $line =~ s/$fix_regex1/$fix_regex2/i; } + case 'NN' { $line =~ s/$fix_regex1/$fix_regex2/; } + } + + if ($save) { + save_fix('R',[ + $id_regex, + $fix_regex1, + $fix_regex2, + $global, + $ignore_case + ]); + } + + return $line; +} + +sub fix_via_insert { + my $id_regex = shift; + my $cols = shift; + my $col_count_check = shift; + my $fix_cols = shift; + my $fix_value = shift; + my $line; + my $save; + + if (!$fix_cols) { + $col_count_check = scalar(@{$cols}); + print "This fix will only trigger when the number of columns is $col_count_check.\n"; + print "Enter value to insert: [] "; + $fix_value = readline(STDIN); chomp $fix_value; + print "Enter comma-separated list of column positions (0-based) for insertion: "; + $fix_cols = readline(STDIN); chomp $fix_cols; + $save = 1; + } + + if ($col_count_check != scalar(@{$cols})) { + print "WARNING: Insert column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n"; + return $line; + } + + my @f = sort(split /,/, $fix_cols); + for (my $i = 0; $i < scalar(@f); $i++) { + splice @{ $cols }, $f[$i] + $i, 0, $fix_value; + } + + eval { + $line = combine_cols($cols); + }; + if ($@) { + print "fix_via_insert error:\n"; + die $@; + } + + if ($save) { + save_fix('I',[ + $id_regex, + $col_count_check, + $fix_cols, + $fix_value + ]); + } + + return $line; +} + +sub fix_via_delete { + my $id_regex = shift; + my $cols = shift; + my $col_count_check = shift; + my $fix_cols = shift; + my $line; + my $save; + + if (!$fix_cols) { + $col_count_check = scalar(@{$cols}); + print "This fix will only trigger when the number of columns is $col_count_check.\n"; + print "Enter comma-separated list of column positions (0-based) to delete: "; + $fix_cols = readline(STDIN); chomp $fix_cols; + $save = 1; + } + + if ($col_count_check != scalar(@{$cols})) { + print "WARNING: Delete column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". 
Skipping.\n"; + return $line; + } + + my @f = sort(split /,/, $fix_cols); + for (my $i = 0; $i < scalar(@f); $i++) { + splice @{ $cols }, $f[$i] - $i, 1; + } + + eval { + $line = combine_cols($cols); + }; + if ($@) { + print "fix_via_delete error:\n"; + die $@; + } + + if ($save) { + save_fix('D',[ + $id_regex, + $col_count_check, + $fix_cols + ]); + } + + return $line; +} + +sub fix_via_join { + my $id_regex = shift; + my $cols = shift; + my $col_count_check = shift; + my $fix_cols = shift; + my $line; + my $save; + + if (!$fix_cols) { + $col_count_check = scalar(@{$cols}); + print "This fix will only trigger when the number of columns is $col_count_check.\n"; + print "Enter comma-separated list of column positions (0-based) to join: "; + $fix_cols = readline(STDIN); chomp $fix_cols; + $save = 1; + } + + if ($col_count_check != scalar(@{$cols})) { + print "WARNING: Join column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n"; + return $line; + } + + # gather + my $fix_value = ''; + my @f = sort { $a <=> $b } (split /,/, $fix_cols); + for (my $i = 0; $i < scalar(@f); $i++) { + $fix_value .= @{ $cols }[$f[$i]]; + } + + # delete + for (my $i = 0; $i < scalar(@f); $i++) { + splice @{ $cols }, $f[$i] - $i, 1; + } + + # insert + splice @{ $cols }, $f[0], 0, $fix_value; + + eval { + $line = combine_cols($cols); + }; + if ($@) { + print "fix_via_join error:\n"; + die $@; + } + + if ($save) { + save_fix('J',[ + $id_regex, + $col_count_check, + $fix_cols + ]); + } + + return $line; +} + + +sub manual_fix { + my $line = shift; + my $cols = shift; + my $insert_delete_allowed = shift; + my $col_count = scalar(@{$cols}) > scalar(@headers) ? scalar(@{$cols}) : scalar(@headers); + my $max_header_length = 0; + + # display columns nice and formatted + for (my $i = 0; $i < scalar(@headers); $i++) { + if (length($headers[$i]) > $max_header_length) { + $max_header_length = length($headers[$i]); + } + } + for (my $i = 0; $i < $col_count; $i++) { + printf "#% d ", $i; + if (defined $headers[$i]) { + print " " x ($max_header_length - length($headers[$i])); + print $headers[$i]; + } else { + print " " x $max_header_length; + } + print ": "; + if (defined $cols->[$i]) { + print "<" . $cols->[$i] . ">"; + } + print "\n"; + } + + # prompt for type of fix +fix_prompt: + print "\n" . format_for_display($line) . "\nFix line#$line_no? [n] "; + my $ans = readline(STDIN); chomp $ans; + $ans = uc(substr($ans,0,1)); + if ($ans eq '') { + $ans = 'N'; + } + if ($insert_delete_allowed ? 
index("RIDJN",$ans)==-1 : index("RN",$ans)==-1) { + goto fix_prompt; + } + + # prompt for matching condition + my $id_regex; + if ($ans ne 'N') { + my $default_id_regex = id_cols_regex($cols); +match_prompt: + print "If matching the end of the string, you may need to use \\s*\$ instead of \$\n"; + print "Insert/Delete/Join fixes will also filter on column count.\n"; + print "Identify this line (and optionally similar lines) with regex: [$default_id_regex] "; + $id_regex = readline(STDIN); chomp $id_regex; + if ($id_regex eq '') { + $id_regex = $default_id_regex; + } + if ($line =~ /$id_regex/) { + print "Regex matches line.\n"; + } else { + print "Regex does not match line.\n"; + goto match_prompt; + } + } + + # prompt and perform actual fixes + switch($ans) { + case 'R' { $line = fix_via_regex($id_regex,$line); } + case 'I' { $line = fix_via_insert($id_regex,$cols); } + case 'D' { $line = fix_via_delete($id_regex,$cols); } + case 'J' { $line = fix_via_join($id_regex,$cols); } + case 'N' { } + else { $ans = 'N'; } + } + + if ($ans ne 'N') { + print "\nNew line#$line_no: $line"; + } + return ( $ans, $line ); +} + +################################################################## Init +GetOptions( + 'config=s' => \$config, + 'idcols=s' => \$id_cols, + 'create-headers' => \$create_headers, + 'use-headers=s' => \$headers_file, + 'fix' => \$fix, + 'nosave' => \$nosave, + 'apply' => \$apply, + 'pad' => \$pad, + 'truncate' => \$truncate, + 'backslash' => \$backslash, + 'debug' => \$debug, + 'help|?' => \$help +); +if ($help || ((@ARGV == 0) && (-t STDIN))) { + die "\n\t$0 [--config ] [--idcols ] [--fix] [--apply] [--pad] [--truncate] \n\n" + . "\tExpects to be a CSV-like UTF-8 encoded file.\n" + . "\tWill produce .clean and .error versions of said file.\n\n" + . "\t--config will read the Perl file for settings information. See 'Example Config' below\n\n" + . "\t--create-headers will generate headers like so: col1, col2, col3, etc.\n" + . "\t--use-headers will generate headers based on the specified , which must contain one column header per line.\n" + . "\t(if neither --create-headers nor --use-headers are specified, then the first line in is assumed to contain the column headers)\n\n" + . "\t--fix will prompt for whether and how to fix broken records, and save those fixes in .fixes\n" + . "\t--idcols takes a comma-separated list of column indexes (starting with 0) to use as matchpoint suggestions for fixes\n" + . "\t--nosave will prevent new fixes from being saved in .fixes\n" + . "\t--apply will apply previously recorded fixes from .fixes\n\n" + . "\t--pad will fill in missing columns at the end if needed for otherwise unbroken records\n" + . "\t--truncate will strip extra columns from the end if needed for otherwise unbroken records\n" + . "\t--backslash will convert backslashes into forward slashes\n\n" + . "\t Example Config:\n\n" + . "\t\t\$CSV_options{quote_char} = '\"';\n" + . "\t\t\$CSV_options{escape_char} = '\"';\n" + . "\t\t\$CSV_options{sep_char} = ',';\n" + . "\t\t\$CSV_options{eol} = \$\\;\n" + . "\t\t\$CSV_options{always_quote} = 0;\n" + . "\t\t\$CSV_options{quote_space} = 1;\n" + . "\t\t\$CSV_options{quote_null} = 1;\n" + . "\t\t\$CSV_options{quote_binary} = 1;\n" + . "\t\t\$CSV_options{binary} = 0;\n" + . "\t\t\$CSV_options{decode_utf8} = 1;\n" + . "\t\t\$CSV_options{keep_meta_info} = 0;\n" + . "\t\t\$CSV_options{allow_loose_quotes} = 0;\n" + . "\t\t\$CSV_options{allow_loose_escapes} = 0;\n" + . "\t\t\$CSV_options{allow_unquoted_escape} = 0;\n" + . 
"\t\t\$CSV_options{allow_whitespace} = 0;\n" + . "\t\t\$CSV_options{blank_is_undef} = 0;\n" + . "\t\t\$CSV_options{empty_is_undef} = 0;\n" + . "\t\t\$CSV_options{verbatim} = 0;\n" + . "\n\n"; +} +if (! -e $ARGV[0]) { + die "$ARGV[0] does not exist\n"; +} +if ($config && ! -e $config) { + die "$config does not exist\n"; +} +if ($apply && -e $ARGV[0] . '.fixes') { + $fixes = retrieve($ARGV[0] . '.fixes'); +} + +################################################################## CSV Setup +$CSV_options{sep_char} = get_separator( path => $ARGV[0], lucky => 1 ); +if ($config && -e $config) { + do $config; +} +$csv = Text::CSV_XS->new(\%CSV_options); +$csv->callbacks( + error => sub { + my ($err, $msg, $pos, $recno) = @_; + return if ($err == 2012); + $line_numbers_for_lines_with_errors{$line_no} = 1; + print "\nline#$line_no * $err : $msg -> (pos#$pos,rec#$recno)\n"; + if ($csv->error_input) { + print $csv->error_input; + print "-" x ($pos - 1); + print "^\n"; + } + $csv->SetDiag(0); + return; + } +); + +################################################################## Reading + +if ($headers_file) { + print "_.,-~= reading $headers_file\n"; + open my $hfile, "<:encoding(utf8)", $headers_file or die "$headers_file: $!"; + while (my $line = <$hfile>) { + chomp $line; + $line =~ s/\s+$//; + $line =~ s/^\s+//; + push @headers, $line; + } + close $hfile; + $expected_column_count = scalar(@headers); + print "Expected column count set to $expected_column_count based on headers.\n"; +} + +print "_.,-~= reading $ARGV[0]\n"; +open my $in, "<:encoding(utf8)", $ARGV[0] or die "$ARGV[0]: $!"; +$line_no = 1; +while (my $line = <$in>) { + print ">>> main loop (#$line_no): $line" if $debug; + if ($backslash) { + $line = convert_backslashes($line); + } + if ($apply) { + $line = apply_line_fixes($line); + } + if ($csv->parse($line)) { + my @columns = $csv->fields(); + if (! $expected_column_count) { + $expected_column_count = scalar(@columns); + print "Expected column count set to $expected_column_count based on first row.\n"; + for (my $i = 0; $i < scalar(@columns) ; $i++) { + if ($create_headers) { + push @headers, "col" . ($i+1); + } else { + push @headers, $columns[$i]; + } + } + } + if (defined $line_numbers_for_lines_with_errors{$line_no}) { + if ($fix) { + my $fix_status; + ($fix_status,$line) = manual_fix($line,\@columns,0); # Regex only + if ($fix_status ne 'N') { + delete $line_numbers_for_lines_with_errors{$line_no}; + $fix_count++; + redo; + } + } + } else { + if (scalar(@columns) < $expected_column_count) { + if ($apply) { + my $new_line = apply_insert_fixes($line,\@columns); + if ($line ne $new_line) { + $line = $new_line; + redo; + } + } + if ($pad) { + $pad_count++; + print "\nline#$line_no>> padding line, from " . scalar(@columns) . " columns "; + my $col_count = scalar(@columns); + for (my $i = 0; $i < $expected_column_count - $col_count; $i++) { + push @columns, '#pad#'; + } + print "to " . scalar(@columns) . " columns.\n"; + eval { + print "before: " . format_for_display($line); + $line = combine_cols(\@columns); + print " after: " . 
format_for_display($line); + }; + if ($@) { + print "padding error:\n"; + die $@; + } + redo; + } + } + if (scalar(@columns) > $expected_column_count) { + if ($apply) { + my $new_line = apply_delete_fixes($line,\@columns); + if ($line ne $new_line) { + $line = $new_line; + redo; + } + $new_line = apply_join_fixes($line,\@columns); + if ($line ne $new_line) { + $line = $new_line; + redo; + } + } + if ($truncate) { + $trunc_count++; + print "\nline#$line_no>> truncating line, from " . scalar(@columns) . " columns "; + splice @columns, $expected_column_count; + print "to " . scalar(@columns) . " columns.\n"; + eval { + print "before: " . format_for_display($line); + $line = combine_cols(\@columns); + print " after: " . format_for_display($line); + }; + if ($@) { + print "truncating error:\n"; + die $@; + } + redo; + } + } + if (scalar(@columns) != $expected_column_count) { + # so broken, but parseable, and thus not handled by the error callback + print "\nline#$line_no * Expected $expected_column_count columns but found " . scalar(@columns) . "\n$line"; + print "-" x length($line) . "\n"; + $line_numbers_for_lines_with_errors{$line_no} = 1; + if ($fix) { + my $fix_status; + ($fix_status,$line) = manual_fix($line,\@columns,1); # Insert/Delete allowed + if ($fix_status ne 'N') { + delete $line_numbers_for_lines_with_errors{$line_no}; + $fix_count++; + redo; + } + } + } + } + if (defined $line_numbers_for_lines_with_errors{$line_no}) { + print "\tIncrementing errors with line# $line_no\n" if $debug; + push @lines_with_errors, $line; + } else { + print "\tIncrementing clean with line# $line_no\n" if $debug; + push @parsed_rows, \@columns; + } + $line_no++; + } else { + die "Parsing error:\n" . $csv->error_input . "\n"; + } +} +close $in; +print "_.,-~= read " . ($line_no-1) . " records "; +print "(" . scalar(@lines_with_errors) . " with errors, $pad_count auto-padded, $trunc_count auto-truncated, $backslash_count backslashes converted, $fix_count manual fixes)\n"; + + +################################################################## Writing good CSV + +print "_.,-~= writing $ARGV[0].clean\n"; +open my $out, ">:encoding(utf8)", "$ARGV[0].clean" or die "$ARGV[0].clean: $!"; +$line_no = 1; +$actual_count = 0; +if ($create_headers || $headers_file) { + unshift @parsed_rows, \@headers; +} +foreach my $row (@parsed_rows) { + eval { + $line = combine_cols($row); + print $out $line; + }; + if ($@) { + print "error:\n"; + die $@; + } + $actual_count++; +} +close $out; +print "_.,-~= wrote " . ($actual_count) . " records\n"; + + +################################################################## Writing broken CSV + +print "_.,-~= writing $ARGV[0].error\n"; +open my $out2, ">:encoding(utf8)", "$ARGV[0].error" or die "$ARGV[0].error: $!"; +foreach my $row (@lines_with_errors) { + print $out2 $row; +} +close $out2; +print "_.,-~= wrote " . (scalar @lines_with_errors) . 
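combine_cols() is a thin wrapper over the Text::CSV_XS combine/string pair, the inverse of parse/fields. The round trip, as a small sketch:

    use Text::CSV_XS;

    # Sketch: fields to CSV string and back again.
    my $csv = Text::CSV_XS->new({ binary => 1 });
    $csv->combine('a', 'b,with comma', 'c') or die "combine failed";
    my $line = $csv->string;            # a,"b,with comma",c
    $csv->parse($line) or die "parse failed";
    my @fields = $csv->fields;          # ('a', 'b,with comma', 'c')
    print "$fields[1]\n";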
" records\n"; + + +################################################################## .no_headers version + +print "_.,-~= creating $ARGV[0].clean.no_headers\n"; + +print `tail -n +2 $ARGV[0].clean > $ARGV[0].clean.no_headers`; + +################################################################## Finished + +print "_.,-~= finished\n"; diff --git a/text/clean_csv.prereqs b/text/clean_csv.prereqs new file mode 100644 index 0000000..cca6957 --- /dev/null +++ b/text/clean_csv.prereqs @@ -0,0 +1,2 @@ +Text::CSV::Separator +Switch diff --git a/text/csv2sql b/text/csv2sql new file mode 100755 index 0000000..de73881 --- /dev/null +++ b/text/csv2sql @@ -0,0 +1,161 @@ +#!/usr/bin/perl -w +use Getopt::Long; +use Text::CSV::Auto; +use Data::Dumper; +use DBI; +use File::Basename; + +my $dbh; +my $cfg; +my $csv_config; + +sub init { + our %config; + do '/openils/conf/offline-config.pl'; + $dbh = DBI->connect( $config{dsn}, $config{usr}, $config{pw} ) or die $DBI::errstr; + $cfg = { + schema => 'm_foo', + auto_options => { + } + }; + our %CSV_options = ( + binary => 1, + auto_diag => 1, + diag_verbose => 1, + ); + $cfg->{auto_options}->{csv_options} = \%CSV_options; + + GetOptions( + 'config=s' => \$csv_config, + 'no-legacy-prefix' => \($cfg->{no_legacy_prefix}), + 'use-no-headers-file' => \($cfg->{use_no_headers_file}), + 'add-x-migrate' => \($cfg->{add_x_migrate}), + 'outfile=s' => \($cfg->{outfile}), + 'schema=s' => \($cfg->{schema}), + 'parent=s' => \($cfg->{parent}), + 'help|?' => \$help + ); + if ($help || ((@ARGV == 0) && (-t STDIN))) { + die qq^\n\t$0 [--config ] [--add-x-migrate] [--no-legacy-prefix] [--schema ] [--parent ] [--outfile ] <"clean" file from clean_csv script>\n\n^; + } + if ($csv_config && ! -e $csv_config) { + die "$csv_config does not exist\n"; + } + if ($csv_config && -e $csv_config) { + do $csv_config; + } + if (! -e $ARGV[0]) { + die "$ARGV[0] does not exist\n"; + } +} + +sub write_sql_sample { + my $cfg = shift; + my $info = shift; + my $fn = $cfg->{outfile} || $cfg->{auto_options}->{file} . '.sql'; + + print "\twriting $fn\n"; + local *SQL; + open SQL, ">$fn"; + print SQL "-- $cfg->{auto_options}->{file}\n/*\n"; + open IN, $cfg->{auto_options}->{file}; + foreach (1..5) { + my $line = ; + print SQL $line; + } + close IN; + print SQL "*/\n"; + return *SQL; +} + +sub write_sql_table { + my $sql = shift; + my $cfg = shift; + my $info = shift; + my $fn = $cfg->{auto_options}->{file}; + my @indices = (); + + print "\twriting table definition\n"; + if ($cfg->{parent}) { + $cfg->{table_name} = $cfg->{parent} . '_legacy'; + } else { + $cfg->{table_name} = lc(basename($fn)); $cfg->{table_name} =~ s/[\-\. ]/_/g; + } + print $sql "DROP TABLE IF EXISTS $cfg->{schema}.$cfg->{table_name};\n"; + print $sql "CREATE UNLOGGED TABLE $cfg->{schema}.$cfg->{table_name} (\n"; + my $idx = 0; + if ($cfg->{add_x_migrate}) { + print $sql " x_migrate BOOLEAN\n"; + $idx++; + push @indices, 'x_migrate'; + } + foreach my $column (@{ $info }) { + my $cn = $column->{'header'}; + if ($cn =~ /^x_/) { + push @indices, $cn; + } + my $col_info = Dumper($column); + $col_info =~ s/^\$VAR1 = //; + print $sql " " . ($idx++ ? ',' : ' '); + print $sql "l_" unless $cfg->{no_legacy_prefix} or $column->{'header'} =~ /^x_/ or $column->{'header'} =~ /^l_/; + print $sql "$cn " . ($cn eq 'x_eg_bib_id' ? 'BIGINT' : 'TEXT'); + print $sql " /*\n $col_info */\n"; + } + if ($cfg->{parent}) { + print $sql ') INHERITS (' . $cfg->{schema} . '.' . $cfg->{parent} . 
");\n"; + } else { + print $sql ");\n"; + } + foreach my $cn (@indices) { + print $sql "CREATE INDEX ON $cfg->{schema}.$cfg->{table_name} ($cn);\n"; + } +} + +sub write_sql_loader { + my $sql = shift; + my $cfg = shift; + my $auto = shift; + my $info = shift; + my $fn = $cfg->{auto_options}->{file} . ($cfg->{use_no_headers_file} ? '.no_headers' : ''); + + print "\twriting copy statement\n"; + print $sql "\n\\COPY $cfg->{schema}.$cfg->{table_name} ("; + my $idx = 0; + foreach my $column (@{ $info }) { + print $sql ($idx++ ? ',' : ''); + print $sql "l_" unless $cfg->{no_legacy_prefix} or $column->{'header'} =~ /^x_/ or $column->{'header'} =~ /^l_/; + print $sql $column->{'header'}; + } + print $sql ") FROM '$fn'"; + if ($auto->csv->sep_char eq chr(9) && ! defined $auto->csv->quote_char && ! defined $auto->csv->escape_char) { + # true .tsv, don't treat as csv + } elsif ($auto->csv->sep_char eq chr(9)) { + # probably good enough .tsv, don't treat as csv + } else { + print $sql " WITH csv " . ($cfg->{use_no_headers_file} ? "" : "header"); + print $sql " delimiter " . $dbh->quote( $auto->csv->sep_char ) unless $dbh->quote( $auto->csv->sep_char ) eq 'NULL'; + print $sql " quote " . $dbh->quote( $auto->csv->quote_char ) unless $dbh->quote( $auto->csv->quote_char ) eq 'NULL'; + print $sql " escape " . $dbh->quote( $auto->csv->escape_char ) unless $dbh->quote( $auto->csv->escape_char ) eq 'NULL'; + } + print $sql "\n"; +} + +sub main { + init(); + foreach my $fn (@ARGV) { + print "processing $fn\n"; + $cfg->{auto_options}->{file} = $fn; + my $auto = Text::CSV::Auto->new($cfg->{auto_options}); + + my $info = $auto->analyze(); + my $sql = write_sql_sample($cfg,$info); + write_sql_table($sql,$cfg,$info); + write_sql_loader($sql,$cfg,$auto,$info); + close $sql; + + print "\tdone.\n"; + } +} + +main(); + diff --git a/text/csv2sql.prereqs b/text/csv2sql.prereqs new file mode 100644 index 0000000..6f98b56 --- /dev/null +++ b/text/csv2sql.prereqs @@ -0,0 +1 @@ +Text::CSV::Auto diff --git a/text/csvcat b/text/csvcat new file mode 100755 index 0000000..4f91b13 --- /dev/null +++ b/text/csvcat @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +my $help; +my $headers = 1; +my $sourcecolumn = 1; +my $sourcetype = 'partial'; +my $first_headers; +my $use_tab = 1; +my $use_comma = 0; + +GetOptions( + 'headers!' => \$headers, + 'source!' => \$sourcecolumn, + 'source-type=s' => \$sourcetype, + 'use-tab' => \$use_tab, + 'use-comma' => \$use_comma, + 'help|?' => \$help +); +if ($use_comma) { + $use_tab = 0; +} +if ($help || @ARGV == 0 || ($sourcetype ne 'partial' && $sourcetype ne 'full')) { + print "$0 <--headers|--noheaders> <--source|--nosource> <--source-type=full|--source-type=partial> <--use-tab|--use-comma> [file1] [file2] \n"; + exit 0; +} + +sub munged_source { + my $fn = shift; + my $s = $fn; + if ($sourcetype eq 'partial') { + my @f = split(/\//, $fn); + $s = $f[0]; + } + return "$s" . ($use_tab ? "\t" : ','); +} + +sub cat_file { + my $fn = shift; + open FILE, "$fn"; + if ($headers) { + if ($fn ne $ARGV[0]) { + my $check = ; # check and throw away headers for subsequent files + if ($check ne $first_headers) { + print STDERR "Mismatched headers between $ARGV[0] and $fn\n"; + print STDERR "$first_headers\n$check\n"; + exit 1; + } + } else { + print "x_source" . ($use_tab ? 
"\t" : ',') if $sourcecolumn; # potential column header + $first_headers = ; + print $first_headers; + } + } + while (my $line = ) { + print munged_source($fn) if $sourcecolumn; + print $line; + } + close FILE; +} + +my @files = @ARGV; +foreach my $file (@files) { + cat_file($file); +} + + + + diff --git a/text/examples/FolletDestinyPatronExport.headers b/text/examples/FolletDestinyPatronExport.headers new file mode 100644 index 0000000..fb7c995 --- /dev/null +++ b/text/examples/FolletDestinyPatronExport.headers @@ -0,0 +1,37 @@ +Barcode +unknown +Last name +First name +Middle name +Expiration date +unknown +Birth date +Gender +Patron Type +Status +User defined 1 +User defined 2 +User defined 3 +User defined 4 +Address line 1 +Address line 2 +City +State +Zip +Email 1 +Phone 1 +Phone 2 +Secondary address line 1 +Secondary address line 2 +Secondary city +Secondary state +Secondary zip +Email 2 +Secondary phone 1 +Secondary phone 2 +unknown +Grade level +unknown +Nickname +Acceptable use policy on file +unknown diff --git a/text/examples/HOLDINGS-MULT.headers b/text/examples/HOLDINGS-MULT.headers new file mode 100644 index 0000000..b54a4a5 --- /dev/null +++ b/text/examples/HOLDINGS-MULT.headers @@ -0,0 +1,3 @@ +x_egid +x_hseq +l_value diff --git a/text/examples/csv.clean.conf b/text/examples/csv.clean.conf new file mode 100644 index 0000000..2abfbc6 --- /dev/null +++ b/text/examples/csv.clean.conf @@ -0,0 +1,3 @@ +$CSV_options{quote_char} = '"'; +$CSV_options{escape_char} = '"'; +$CSV_options{sep_char} = ','; diff --git a/text/examples/destiny_852.map b/text/examples/destiny_852.map new file mode 100644 index 0000000..8307c93 --- /dev/null +++ b/text/examples/destiny_852.map @@ -0,0 +1,12 @@ +price 852 9 +library 852 a +location 852 b +call_number 852 h +cn_item_number 852 i +cn_prefix 852 k +cn_suffix 852 m +barcode 852 p +copy_number 852 t +item_note 852 x m:multi +extra1 926 a +extra2 926 b diff --git a/text/examples/horizon_949.map b/text/examples/horizon_949.map new file mode 100644 index 0000000..69f9496 --- /dev/null +++ b/text/examples/horizon_949.map @@ -0,0 +1,24 @@ +t949a 949 a +t949b 949 b +t949c 949 c +t949d 949 d +t949e 949 e +t949f 949 f +t949g 949 g +t949h 949 h +t949i 949 i +t949j 949 j +t949k 949 k +t949n 949 n +t949o 949 o +t949p 949 p +t949q 949 q +t949r 949 r +t949s 949 s +t949t 949 t +t949u 949 u +t949v 949 v +t949w 949 w +t949x 949 x +t949y 949 y +t949z 949 z diff --git a/text/examples/tabs.clean.conf b/text/examples/tabs.clean.conf new file mode 100644 index 0000000..b92b9d0 --- /dev/null +++ b/text/examples/tabs.clean.conf @@ -0,0 +1,3 @@ +$CSV_options{quote_char} = undef; +$CSV_options{escape_char} = undef; +$CSV_options{sep_char} = chr(9); diff --git a/text/fix_split_csv b/text/fix_split_csv new file mode 100755 index 0000000..64d2182 --- /dev/null +++ b/text/fix_split_csv @@ -0,0 +1,20 @@ +#!/usr/bin/perl + +my $delimiter = "\t"; +my $colcount = 11; + +my $running_field_count = 0; + +while (my $line = <>) { + chomp $line; + print $line; + my @f = split /$delimiter/, $line; + if (scalar(@f) != 1) { + $running_field_count += scalar(@f); + } + if ($running_field_count >= $colcount) { + $running_field_count = 0; + print "\n"; + } +} + diff --git a/text/html2csv.py b/text/html2csv.py new file mode 100755 index 0000000..da047d8 --- /dev/null +++ b/text/html2csv.py @@ -0,0 +1,139 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- +# Hello, this program is written in Python - http://python.org +programname = 'html2csv - version 2002-09-20 - http://sebsauvage.net' + 
+import sys, getopt, os.path, glob, HTMLParser, re
+
+try:    import psyco ; psyco.jit()  # If present, use psyco to accelerate the program
+except: pass
+
+def usage(progname):
+    ''' Display program usage. '''
+    progname = os.path.split(progname)[1]
+    if os.path.splitext(progname)[1] in ['.py','.pyc']: progname = 'python '+progname
+    return '''%s
+A coarse HTML tables to CSV (Comma-Separated Values) converter.
+
+Syntax    : %s source.html
+
+Arguments : source.html is the HTML file you want to convert to CSV.
+            By default, the file will be converted to csv with the same
+            name and the csv extension (source.html -> source.csv)
+            You can use * and ?.
+
+Examples   : %s mypage.html
+           : %s *.html
+
+This program is public domain.
+Author : Sebastien SAUVAGE
+         http://sebsauvage.net
+''' % (programname, progname, progname, progname)
+
+class html2csv(HTMLParser.HTMLParser):
+    ''' A basic parser which converts HTML tables into CSV.
+        Feed HTML with feed(). Get CSV with getCSV(). (See example below.)
+        All tables in HTML will be converted to CSV (in the order they occur
+        in the HTML file).
+        You can process very large HTML files by feeding this class with chunks
+        of html while getting chunks of CSV by calling getCSV().
+        Should handle badly formatted html (missing <tr>, <td>, </tr>,
+        extraneous </td>, ...).
+        This parser uses HTMLParser from the HTMLParser module,
+        not HTMLParser from the htmllib module.
+        Example: parser = html2csv()
+                 parser.feed( open('mypage.html','rb').read() )
+                 open('mytables.csv','w+b').write( parser.getCSV() )
+        This class is public domain.
+        Author: Sébastien SAUVAGE
+                http://sebsauvage.net
+        Versions:
+           2002-09-19 : - First version
+           2002-09-20 : - now uses HTMLParser.HTMLParser instead of htmllib.HTMLParser.
+                        - now parses command-line.
+        To do:
+            - handle <TH> tags
+            - convert html entities (&name; and &#ref;) to Ascii.
+            '''
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+        self.CSV = ''      # The CSV data
+        self.CSVrow = ''   # The current CSV row being constructed from HTML
+        self.inTD = 0      # Used to track if we are inside or outside a <TD>...</TD> tag.
+        self.inTR = 0      # Used to track if we are inside or outside a <TR>...</TR> tag.
+        self.re_multiplespaces = re.compile('\s+')  # regular expression used to remove excess spaces
+        self.rowCount = 0  # CSV output line counter.
+    def handle_starttag(self, tag, attrs):
+        if   tag == 'tr': self.start_tr()
+        elif tag == 'td': self.start_td()
+    def handle_endtag(self, tag):
+        if   tag == 'tr': self.end_tr()
+        elif tag == 'td': self.end_td()         
+    def start_tr(self):
+        if self.inTR: self.end_tr()  # <TR> implies </TR>
+        self.inTR = 1
+    def end_tr(self):
+        if self.inTD: self.end_td()  # </TR> implies </TD>
+        self.inTR = 0            
+        if len(self.CSVrow) > 0:
+            self.CSV += self.CSVrow[:-1]
+            self.CSVrow = ''
+        self.CSV += '\n'
+        self.rowCount += 1
+    def start_td(self):
+        if not self.inTR: self.start_tr() # <TD> implies <TR>
+        self.CSVrow += '"'
+        self.inTD = 1
+    def end_td(self):
+        if self.inTD:
+            self.CSVrow += '",'  
+            self.inTD = 0
+    def handle_data(self, data):
+        if self.inTD:
+            self.CSVrow += self.re_multiplespaces.sub(' ',data.replace('\t',' ').replace('\n','').replace('\r','').replace('"','""'))
+    def getCSV(self,purge=False):
+        ''' Get output CSV.
+            If purge is true, getCSV() will return all remaining data,
+            even if <tr> or <td> are not properly closed.
+            (You would typically call getCSV with purge=True when you do not have
+            any more HTML to feed and you suspect dirty HTML (unclosed tags).) '''
+        if purge and self.inTR: self.end_tr()  # This will also end_td and append last CSV row to output CSV.
+        dataout = self.CSV[:]
+        self.CSV = ''
+        return dataout
+
+
+if __name__ == "__main__":
+    try: # Put getopt in place for future usage.
+        opts, args = getopt.getopt(sys.argv[1:], '')
+    except getopt.GetoptError:
+        print usage(sys.argv[0])  # print help information and exit:
+        sys.exit(2)
+    if len(args) == 0:
+        print usage(sys.argv[0])  # print help information and exit:
+        sys.exit(2)       
+    print programname
+    html_files = glob.glob(args[0])
+    for htmlfilename in html_files:
+        outputfilename = os.path.splitext(htmlfilename)[0]+'.csv'
+        parser = html2csv()
+        print 'Reading %s, writing %s...' % (htmlfilename, outputfilename)
+        try:
+            htmlfile = open(htmlfilename, 'rb')
+            csvfile = open( outputfilename, 'w+b')
+            data = htmlfile.read(8192)
+            while data:
+                parser.feed( data )
+                csvfile.write( parser.getCSV() )
+                sys.stdout.write('%d CSV rows written.\r' % parser.rowCount)
+                data = htmlfile.read(8192)
+            csvfile.write( parser.getCSV(True) )
+            csvfile.close()
+            htmlfile.close()
+        except:
+            print 'Error converting %s        ' % htmlfilename
+            try:    htmlfile.close()
+            except: pass
+            try:    csvfile.close()
+            except: pass
+    print 'All done.                                      '
diff --git a/text/html2tsv.py b/text/html2tsv.py
new file mode 100755
index 0000000..97901b2
--- /dev/null
+++ b/text/html2tsv.py
@@ -0,0 +1,139 @@
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# Hello, this program is written in Python - http://python.org
+programname = 'html2tsv - version 2002-09-20 - http://sebsauvage.net'
+
+import sys, getopt, os.path, glob, HTMLParser, re
+
+try:    import psyco ; psyco.jit()  # If present, use psyco to accelerate the program
+except: pass
+
+def usage(progname):
+    ''' Display program usage. '''
+    progname = os.path.split(progname)[1]
+    if os.path.splitext(progname)[1] in ['.py','.pyc']: progname = 'python '+progname
+    return '''%s
+A coarse HTML tables to TSV (Tab-Separated Values) converter.
+
+Syntax    : %s source.html
+
+Arguments : source.html is the HTML file you want to convert to TSV.
+            By default, the file will be converted to tsv with the same
+            name and the tsv extension (source.html -> source.tsv)
+            You can use * and ?.
+
+Examples   : %s mypage.html
+           : %s *.html
+
+This program is public domain.
+Author : Sebastien SAUVAGE 
+         http://sebsauvage.net
+''' % (programname, progname, progname, progname)
+
+class html2tsv(HTMLParser.HTMLParser):
+    ''' A basic parser which converts HTML tables into TSV.
+        Feed HTML with feed(). Get TSV with getTSV(). (See example below.)
+        All tables in HTML will be converted to TSV (in the order they occur
+        in the HTML file).
+        You can process very large HTML files by feeding this class with chunks
+        of html while getting chunks of TSV by calling getTSV().
+        Should handle badly formatted html (missing <tr>, <td>, </tr>,
+        extraneous </td>, ...).
+        This parser uses HTMLParser from the HTMLParser module,
+        not HTMLParser from the htmllib module.
+        Example: parser = html2tsv()
+                 parser.feed( open('mypage.html','rb').read() )
+                 open('mytables.tsv','w+b').write( parser.getTSV() )
+        This class is public domain.
+        Author: Sébastien SAUVAGE 
+                http://sebsauvage.net
+        Versions:
+           2002-09-19 : - First version
+           2002-09-20 : - now uses HTMLParser.HTMLParser instead of htmllib.HTMLParser.
+                        - now parses command-line.
+        To do:
+            - handle <TH> tags
+            - convert html entities (&name; and &#ref;) to Ascii.
+            '''
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+        self.TSV = ''      # The TSV data
+        self.TSVrow = ''   # The current TSV row being constructed from HTML
+        self.inTD = 0      # Used to track if we are inside or outside a <TD>...</TD> tag.
+        self.inTR = 0      # Used to track if we are inside or outside a <TR>...</TR> tag.
+        self.re_multiplespaces = re.compile('\s+')  # regular expression used to remove excess spaces
+        self.rowCount = 0  # TSV output line counter.
+    def handle_starttag(self, tag, attrs):
+        if   tag == 'tr': self.start_tr()
+        elif tag == 'td': self.start_td()
+    def handle_endtag(self, tag):
+        if   tag == 'tr': self.end_tr()
+        elif tag == 'td': self.end_td()         
+    def start_tr(self):
+        if self.inTR: self.end_tr()  # <TR> implies </TR>
+        self.inTR = 1
+    def end_tr(self):
+        if self.inTD: self.end_td()  # </TR> implies </TD>
+        self.inTR = 0            
+        if len(self.TSVrow) > 0:
+            self.TSV += self.TSVrow[:-1]
+            self.TSVrow = ''
+        self.TSV += '\n'
+        self.rowCount += 1
+    def start_td(self):
+        if not self.inTR: self.start_tr() # <TD> implies <TR>
+        self.TSVrow += ''
+        self.inTD = 1
+    def end_td(self):
+        if self.inTD:
+            self.TSVrow += '\t'  
+            self.inTD = 0
+    def handle_data(self, data):
+        if self.inTD:
+            self.TSVrow += self.re_multiplespaces.sub(' ',data.replace('\t',' ').replace('\n','').replace('\r',''))
+    def getTSV(self,purge=False):
+        ''' Get output TSV.
+            If purge is true, getTSV() will return all remaining data,
+            even if <tr> or <td> are not properly closed.
+            (You would typically call getTSV with purge=True when you do not have
+            any more HTML to feed and you suspect dirty HTML (unclosed tags).) '''
+        if purge and self.inTR: self.end_tr()  # This will also end_td and append last TSV row to output TSV.
+        dataout = self.TSV[:]
+        self.TSV = ''
+        return dataout
+
+
+if __name__ == "__main__":
+    try: # Put getopt in place for future usage.
+        opts, args = getopt.getopt(sys.argv[1:], '')
+    except getopt.GetoptError:
+        print usage(sys.argv[0])  # print help information and exit:
+        sys.exit(2)
+    if len(args) == 0:
+        print usage(sys.argv[0])  # print help information and exit:
+        sys.exit(2)       
+    print programname
+    html_files = glob.glob(args[0])
+    for htmlfilename in html_files:
+        outputfilename = os.path.splitext(htmlfilename)[0]+'.tsv'
+        parser = html2tsv()
+        print 'Reading %s, writing %s...' % (htmlfilename, outputfilename)
+        try:
+            htmlfile = open(htmlfilename, 'rb')
+            tsvfile = open( outputfilename, 'w+b')
+            data = htmlfile.read(8192)
+            while data:
+                parser.feed( data )
+                tsvfile.write( parser.getTSV() )
+                sys.stdout.write('%d TSV rows written.\r' % parser.rowCount)
+                data = htmlfile.read(8192)
+            tsvfile.write( parser.getTSV(True) )
+            tsvfile.close()
+            htmlfile.close()
+        except:
+            print 'Error converting %s        ' % htmlfilename
+            try:    htmlfile.close()
+            except: pass
+            try:    tsvfile.close()
+            except: pass
+    print 'All done.                                      '
diff --git a/text/join_lines b/text/join_lines
new file mode 100755
index 0000000..91c08e5
--- /dev/null
+++ b/text/join_lines
@@ -0,0 +1,100 @@
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+join_lines - program for joining adjacent lines from stdin or one or more files
+
+=head1 SYNOPSIS
+
+B<join_lines> [options...] [text files...]
+
+=head1 DESCRIPTION
+
+B<join_lines> is used to join adjacent lines from stdin or one or more files,
+trimming trailing line feeds and carriage returns, and sending the output to
+stdout. Arguments are used to define the criteria and exact behavior.
+
+=head1 OVERVIEW
+
+B<join_lines> understands the following optional arguments:
+
+=over 15
+
+=item --help
+
+Show this documentation.
+
+=item --delimiter=<d>
+
+B<join_lines> will use the specified delimiter <d> to separate joined lines. The
+default is to use no delimiter when joining lines.
+
+=item --lines=<n>
+
+This will cause B<join_lines> to join every <n> lines into one line. See the
+example below.
+
+=item --record-separator=<sep>
+
+This will cause B<join_lines> to immediately start a new line when it encounters
+a line solely containing <sep>. That line itself will not otherwise be used. This
+argument may be used in conjunction with --lines, but probably should not be. :)
+
+=item --record-separator-empty-line
+
+This works like --record-separator, but is a way to define an "empty" line as
+being the record separator.
+
+=back
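+
+For example (a sketch: broken.txt is a hypothetical file in which each record
+was exported across three lines), the following would rejoin the pieces with a
+space between the fragments:
+
+=over 5
+
+join_lines --lines=3 --delimiter=' ' broken.txt > joined.txt
+
+=back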
+
+=cut
+###############################################################################
+
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+
+my $help;
+my $delimiter;
+my $linecount;
+my $record_separator;
+my $record_separator_empty_line;
+
+GetOptions(
+	'delimiter=s' => \$delimiter,
+	'lines=s' => \$linecount,
+	'record-separator=s' => \$record_separator,
+	'record-separator-empty-line' => \$record_separator_empty_line,
+	'help|?' => \$help
+);
+pod2usage(-verbose => 2) if $help; 
+
+my $count = 0;
+my @lines = ();
+
+sub join_lines {
+    print join($delimiter || '', @lines) . "\n";
+    @lines = ();
+    $count = 0;
+}
+
+while (my $line = <>) {
+    $count++;
+    $line =~ s/[\r\n]+$//g;
+    if (defined $record_separator_empty_line && $line eq '') {
+        join_lines();
+    } elsif (defined $record_separator && $line eq $record_separator) {
+        join_lines();
+    } elsif (defined $linecount && $count == $linecount) {
+        push @lines, $line;
+        join_lines();
+    } elsif (! defined $linecount && ! defined $record_separator && ! defined $record_separator_empty_line) {
+        print "$line\n"; # passthru when given no arguments
+    } else {
+        push @lines, $line;
+    }
+}
+if (scalar(@lines) > 0) {
+    print join($delimiter || '', @lines) . "\n";
+}
diff --git a/text/join_lines.one-off.001 b/text/join_lines.one-off.001
new file mode 100755
index 0000000..758beb1
--- /dev/null
+++ b/text/join_lines.one-off.001
@@ -0,0 +1,6 @@
+#!/usr/bin/perl -w
+# we want our lines to end in <13><10>, ideally; this catches lines where an isolated <10> causes a premature line break
+while (my $line = <>) {
+    $line =~ s/([^\x0D])\x0A$/$1/;
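+    # e.g. a line ending in a bare <10> loses that <10> and, since we print
+    # without adding a newline, gets joined to the following line; lines
+    # already ending in <13><10> pass through unchanged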
+    print $line;
+}
diff --git a/text/join_lines_if_short b/text/join_lines_if_short
new file mode 100755
index 0000000..1400104
--- /dev/null
+++ b/text/join_lines_if_short
@@ -0,0 +1,87 @@
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+join_lines_if_short - program for joining some adjacent lines from stdin or one
+or more files
+
+=head1 SYNOPSIS
+
+B<join_lines_if_short> [options...] [text files...]
+
+=head1 DESCRIPTION
+
+B<join_lines_if_short> is used to join adjacent lines from stdin or one or more
+files, trimming trailing line feeds and carriage returns, and sending the output
+to stdout.  Lines are only joined if the first line is short of the expected
+number of columns.
+
+=head1 OVERVIEW
+
+B<join_lines_if_short> understands the following optional arguments:
+
+=over 15
+
+=item --help
+
+Show this documentation.
+
+=item --delimiter=<d>
+
+B<join_lines_if_short> will use the specified delimiter <d> for determining the
+column count for each line.  The default is to assume tab as the delimiter.
+
+=item --join_delimiter=<j>
+
+B<join_lines_if_short> will use the specified delimiter <j> when joining lines.
+The default is to use no delimiter.
+
+=item --columns=<n>
+
+B<join_lines_if_short> will expect each line to contain <n> columns.  If a line
+has fewer than <n> columns, then this is the trigger for joining that line with
+the next line.  The new line will be reconsidered and potentially joined with
+the next line and so on.  See the example below.
+
+=back
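+
+For example (a sketch: export.tsv is a hypothetical tab-delimited file with
+five columns, where embedded newlines split some records across lines), the
+following would rejoin the short lines:
+
+=over 5
+
+join_lines_if_short --columns=5 export.tsv > export.fixed.tsv
+
+=back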
+
+=cut
+###############################################################################
+
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+
+my $help;
+my $delimiter = "\t";
+my $join_delimiter = "";
+my $colcount;
+
+GetOptions(
+	'delimiter=s' => \$delimiter,
+	'join_delimiter=s' => \$join_delimiter,
+	'columns=s' => \$colcount,
+	'help|?' => \$help
+);
+pod2usage(-verbose => 2) if $help || ! defined $colcount; 
+
+my $line_buffer = '';
+while (my $line = <>) {
+    chomp $line;
+    if ($line_buffer eq '') {
+        $line_buffer = $line;
+    } else {
+        $line_buffer = "$line_buffer$join_delimiter$line";
+    }
+    my @f = split /$delimiter/, $line_buffer;
+    if (scalar(@f) >= $colcount) {
+        $line_buffer =~ s/\x0D//g; # strip embedded carriage returns
+        print "$line_buffer\n";
+        $line_buffer = '';
+    }
+}
+if ($line_buffer ne '') {
+    print "$line_buffer\n";
+}
diff --git a/text/split_body_from_headers b/text/split_body_from_headers
new file mode 100755
index 0000000..7e2b3f4
--- /dev/null
+++ b/text/split_body_from_headers
@@ -0,0 +1,13 @@
+#!/bin/bash
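+# Usage sketch: split_body_from_headers FILE [HEADER_LINE]
+# Writes FILE.headers (the header line, HEADER_LINE, defaulting to 1) and
+# FILE.no_headers (everything after it); if HEADER_LINE > 1, also writes
+# FILE.with_headers (the header line plus everything after it).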
+START=$2
+if [ ! $START ]; then
+    START=1;
+fi
+tail -n +$START $1 | head -1 > $1.headers
+echo "wrote $1.headers (line $START)"
+tail -n +`expr $START + 1` $1 > $1.no_headers
+echo "wrote $1.no_headers (after line $START)"
+if [ "$START" -gt "1" ]; then
+    tail -n +$START $1 > $1.with_headers
+    echo "wrote $1.with_headers (starting with line $START)"
+fi
diff --git a/text/strip_cm_tabs b/text/strip_cm_tabs
new file mode 100755
index 0000000..13479c2
--- /dev/null
+++ b/text/strip_cm_tabs
@@ -0,0 +1,6 @@
+#!/usr/bin/perl -w
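+# replaces each embedded <13><9> (carriage return + tab) pair with an
+# underscore; a sketch of intent inferred from the substitution below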
+while (my $line = <>) {
+    chomp $line;
+    $line =~ s/\x0D\x09/_/g;
+    print "$line\n";
+}
-- 
1.7.2.5