--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig - git-like program for tracking and manipulating legacy data files for
+migrations
+
+=head1 SYNOPSIS
+
+B<mig> <command> [argument] [...]
+
+=head1 DESCRIPTION
+
+B<mig> is used to track and manipulate CSV or CSV-like text files exported from
+legacy systems for migration into Evergreen. It can be a wrapper for some
+other migration tools and tracks state using a PostgreSQL table in a given
+migration schema.
+
+It makes use of certain environment variables that may be set by the B<mig-env>
+tool: PGHOST, PGPORT, PGUSER, PGDATABASE, MIGSCHEMA, and MIGWORKDIR
+
+For most commands, if the current working directory falls outside of the
+directory specified by MIGWORKDIR, then mig will assume that environment is
+also incorrect and bail before doing any actual work.
+
+~/.pgpass should also be configured, as B<mig> will not prompt for a database
+password.
+
+Only the B<env> and B<help> commands work without the MIGSCHEMA environment
+variable being set.
+
+=head1 OVERVIEW
+
+Using B<mig> should go something like this:
+
+=over 15
+
+=item mig env create m_foo # Sets up the environment
+
+=item mig env use m_foo # Spawns a shell using the configured environment
+
+=item mig init # creates the m_foo schema in the database if needed, and other tables
+
+=item mig add patrons.tsv # tracks an incoming data file; repeat for additional files
+
+=item mig iconv patrons.tsv # convert it to UTF8, creating patrons.tsv.utf8
+
+=item mig clean patrons.tsv # cleans the file, creating patrons.tsv.utf8.clean
+
+=item mig link patrons.tsv actor_usr # makes the soon-to-be staging table a child of m_foo.actor_usr
+
+=item mig convert patrons.tsv # creates a .sql file for staging the data
+
+=item mig stage patrons.tsv # load said .sql file
+
+=item mig mapper patrons.tsv # interactive tool for analyzing/mapping the staging table
+
+=item mig analysis patrons.tsv # writes a summary .tsv file of mapped/flagged fields from the staging table
+
+=item mig map patrons.tsv # apply configured mappings
+
+=item mig write_prod patrons.tsv # creates a .sql file for pushing the staging data into production
+
+=back
+
+=head1 COMMANDS
+
+=over 15
+
+=item B<help> [command]
+
+Display this very same documentation, or specific documentation for one of the
+commands listed here.
+
+=item B<env> <create|use|show> <schema>
+
+Invokes B<mig-env> with the same arguments. I<mig-env> can set important
+environment variables and spawn a shell with those variables, and it also does
+some directory creation and symlinking.
+
+=item B<init>
+
+Create or re-create the PostgreSQL tracking table for the schema specified by
+the MIGSCHEMA environment variable. If needed, create the migration schema
+itself and run migration_tools.init() and build() if the migration_tools schema
+exists.
+
+=item B<status> [file] [...]
+
+Show status information for either the specified files or all tracked files if
+no argument is given.
+
+=item B<add> [--no-headers|--headers] <file> [file|--no-headers|--headers] [...]
+
+Add the specified files to the migration tracker. Until --no-headers is
+specified, the tracker will assume the files have headers.
+
+You can do crazy stuff like
+B<mig add file1 --no-headers file2 file3 --headers file4>
+
+=item B<remove> <file> [file] [...]
+
+Remove the specified files from the migration tracker.
+
+=item B<iconv> <file> [other arguments...]
+
+Attempts to invoke B<iconv> on the specified tracked file, placing the output in
+<file>.utf8
+
+If given no other arguments, the invocation will look like
+
+=over 5
+
+iconv -f ISO-8859-1 -t UTF-8 -o <file>.utf8 <file>
+
+=back
+
+otherwise, the arguments will be passed through like so
+
+=over 5
+
+iconv [other arguments...] -o <file>.utf8 <file>
+
+=back
+
+=item B<skip-iconv> <file>
+
+If this is used instead of B<iconv>, then B<mig> will look for an existing
+<file>.utf8 and use it instead of attempting to create one.
+
+=item B<clean> <file> [other arguments...]
+
+Attempts to invoke B<clean_csv> on the iconv-converted specified tracked file,
+placing the output in <file>.utf8.clean
+
+If given no other arguments, the invocation will look like
+
+=over 5
+
+clean_csv --config scripts/clean.conf --fix --apply <--create-headers> <file>
+
+=back
+
+otherwise, the arguments will be passed through like so
+
+=over 5
+
+clean_csv [other arguments...] <file>
+
+=back
+
+=item B<skip-clean> <file>
+
+If this is used instead of B<clean>, then B<mig> will look for an existing
+<file>.utf8.clean and use it instead of attempting to create one.
+
+=item B<link> <file> <parent table>
+
+Associate the specified file with a parent table within the migration schema.
+
+Linking multiple files to the same parent table is not allowed currently.
+
+=item B<unlink> <file>
+
+Removes any association between the specified file and a parent table within
+the migration schema.
+
+=item B<convert> <file>
+
+Attempts to invoke B<csv2sql> on the .utf8.clean version of the specified
+tracked file, creating either [file].utf8.clean.stage.sql or
+<parent table>_stage.sql depending on whether the file has been linked to a
+parent table within the migration schema or not.
+
+If given no other arguments, the invocation will look like
+
+=over 5
+
+csv2sql --config scripts/clean.conf --add-x-migrate --schema <MIGSCHEMA> [--parent <PARENT TABLE>] -o <[<file>.utf8.clean.stage.sql]|[parent_table_stage.sql]> <FILE>.utf8.clean
+
+=back
+
+otherwise, the arguments will be passed through like so
+
+=over 5
+
+csv2sql [other arguments...] -o <[<file>.utf8.clean.stage.sql]|[parent_table_stage.sql]> <file>.utf8.clean
+
+=back
+
+=item B<stage> <file> [other arguments...]
+
+Load the SQL-converted version of the specified file into the migration schema.
+
+Extra arguments are passed to the underlying call to psql
+
+=item B<mapper> <file>
+
+Interactive session for analyzing, flagging, and mapping legacy field data to
+Evergreen fields.
+
+Upon exit, generate either [file].clean.map.sql or <parent table>_map.sql. The
+SQL generated will be UPDATE's for setting the Evergreen-specific columns for a
+given file's staging tables, and TRUNCATE's and INSERT's for auxiliary tables.
+The files will have \include hooks for pulling in additional mapping files
+(for example, end-user mappings for circ modifiers, etc.)
+
+=item B<analysis> [file]
+
+Writes a MIGSCHEMA.tsv file containing a break-down of mapped and flagged
+fields from the specified file, or all staged files if no file is specified.
+
+The main goal of the tsv file is to present end-user mappable data for circ
+modifiers, shelving locations, patron profiles, etc. We use spreadsheets for
+this now but may move to a dedicated UI in the future.
+
+=item B<map> [file]
+
+Applies the mapping sql to the migration schema for the specified mapped file,
+or for all mapped files if no file is specified.
+
+=item B<write_prod> [file]
+
+Generates <parent table>_prod.sql for the specified linked and mapped file, or
+all such files if no file is specified.
+
+=back
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use FindBin;
+my $mig_bin = "$FindBin::Bin/mig-bin/";
+use lib "$FindBin::Bin/mig-bin";
+use Mig;
+
pod2usage(-verbose => 2) if ! $ARGV[0];

# Sub-commands delegated to mig-bin/mig-<command> after verifying that
# MIGSCHEMA is set.
my @schema_commands = qw(
    init status add remove iconv skip-iconv clean skip-clean
    link unlink convert stage mapper quicksheet
);
my %is_schema_command = map { ($_ => 1) } @schema_commands;

my $command = $ARGV[0];
if ($command eq 'help') {
    # 'mig help <cmd>' shows <cmd>'s own POD when the sub-script exists;
    # bare 'mig help' (or an unknown command) shows this script's POD.
    if (defined $ARGV[1] && -e $mig_bin . "mig-$ARGV[1]") {
        system( $mig_bin . "mig-$ARGV[1]", '--help' );
    } else {
        pod2usage(-verbose => 2);
    }
} elsif ($command eq 'env') {
    # env works without MIGSCHEMA (it is how MIGSCHEMA gets set).
    standard_invocation(@ARGV);
} elsif ($is_schema_command{$command}) {
    Mig::die_if_no_env_migschema();
    standard_invocation(@ARGV);
} elsif ($command eq 'map' || $command eq 'load') {
    # Stubs: these only validate the environment and invoke nothing.
    # NOTE(review): the POD documents 'analysis' and 'write_prod' but they
    # are not dispatched here, while 'quicksheet' is dispatched but not
    # documented -- confirm the intended command set.
    Mig::die_if_no_env_migschema();
} elsif ($command eq 'wdir') {
    print "$MIGWORKDIR\n";
} elsif ($command eq 'gdir') {
    print "$MIGBASEGITDIR\n";
} elsif ($command eq 'sdir') {
    print "$MIGGITDIR\n";
} else {
    pod2usage(1);
}

# Delegate to mig-bin/mig-<command>, passing along the remaining arguments.
sub standard_invocation {
    my $cmd = shift;
    system( $mig_bin . "mig-$cmd", @_ );
}
+
+
--- /dev/null
+package Mig;
+
+use strict;
+use Exporter;
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+
+$VERSION = 1.00;
+@ISA = qw(Exporter);
+@EXPORT = ();
+@EXPORT_OK = qw();
+%EXPORT_TAGS = (
+ DEFAULT => []
+);
+
+use DBI;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+
# Open a DBI handle to the migration database using the PG* environment
# variables. The password is undef on purpose: it must come from ~/.pgpass.
# Dies with the DBI error when the connection fails.
sub db_connect {
    my $dbh = DBI->connect(
        "dbi:Pg:host=$PGHOST;dbname=$PGDATABASE;port=$PGPORT"
        ,$PGUSER
        ,undef
    ) || die "Unable to connect to $PGHOST:$PGPORT:$PGDATABASE:$PGUSER : "
        # DBI reports connect failures via $DBI::errstr; $! is not set.
        . (defined $DBI::errstr ? $DBI::errstr : 'unknown error') . "\n";
    return $dbh;
}
# Close a handle obtained from db_connect().
sub db_disconnect {
    my ($dbh) = @_;
    return $dbh->disconnect;
}
+
# Flatten a (possibly multi-line) SQL string to a single line for logging,
# echo it to stdout, and return it.
# Newlines (plus surrounding indentation) are replaced with a single space
# so that keywords split across lines are not fused together.
sub sql {
    my $sql = shift;
    chomp $sql;
    $sql =~ s/\s*\n\s*/ /g;
    print "\n$sql\n";
    return $sql;
}
+
# Bail out early when MIGSCHEMA is unset or empty; most sub-commands
# cannot operate without it.
sub die_if_no_env_migschema {
    return if $MIGSCHEMA;
    die "MIGSCHEMA environment variable not set.  See 'mig env help'\n";
}
+
# Return 1 when the schema named by MIGSCHEMA exists in the database,
# 0 otherwise; reports the result on stdout either way.
sub check_for_db_migschema {
    my $dbh = db_connect();
    my $sth = $dbh->prepare("
        SELECT EXISTS(
            SELECT 1
            FROM pg_namespace
            WHERE nspname = ?
        );"
    );
    # DBI reports failures via errstr; $! only reflects OS-level errors.
    my $rv = $sth->execute($MIGSCHEMA)
        || die "Error checking for migration schema ($MIGSCHEMA): " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    my $found;
    if ($cols[0]) {
        print "Found migration schema ($MIGSCHEMA) at $PGHOST:$PGPORT:$PGDATABASE:$PGUSER\n";
        $found = 1;
    } else {
        print "Migration schema ($MIGSCHEMA) does not exist at $PGHOST:$PGPORT:$PGDATABASE:$PGUSER\n";
        $found = 0;
    }
    db_disconnect($dbh);
    return $found;
}
+
# Heuristic: migration tables are considered present when MIGSCHEMA
# contains an asset_copy table. Warns on stdout when they are missing.
sub check_db_migschema_for_migration_tables {
    my $present = check_db_migschema_for_specific_table('asset_copy');
    print "Missing migration tables (such as $MIGSCHEMA.asset_copy)\n"
        unless $present;
    return $present;
}
+
# Return 1 when the named table exists inside MIGSCHEMA, else 0.
#
# $table - unqualified table name to look for
sub check_db_migschema_for_specific_table {
    my $table = shift;
    my $dbh = db_connect();
    # Bind values instead of interpolating quoted strings into the SQL.
    my $sth = $dbh->prepare("
        SELECT EXISTS(
            SELECT 1
            FROM information_schema.tables
            WHERE table_schema = ?
            AND table_name = ?
        );"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($MIGSCHEMA,$table)
        || die "Error checking migration schema ($MIGSCHEMA) for table ($table): " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    my $found = $cols[0] ? 1 : 0;
    db_disconnect($dbh);
    return $found;
}
+
# Return true when a 'migration_tools' schema exists in the database.
sub check_for_migration_tools {
    my $dbh = db_connect();
    my $sth = $dbh->prepare("
        SELECT EXISTS(
            SELECT 1
            FROM pg_namespace
            WHERE nspname = 'migration_tools'
        );"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute()
        || die "Error checking for migration_tools schema: " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}
+
# Require the migration_tools schema; dies when it is absent, otherwise
# reports success on stdout.
sub die_if_no_migration_tools {
    check_for_migration_tools()
        or die "Missing migration_tools schema\n";
    print "Found migration_tools schema\n";
}
+
# Return true when MIGSCHEMA contains the 'tracked_file' tracking table
# (created by mig init).
sub check_for_mig_tracking_table {
    my $dbh = db_connect();
    # Bind the schema name instead of interpolating a quoted string.
    my $sth = $dbh->prepare("
        SELECT EXISTS(
            SELECT 1
            FROM information_schema.tables
            WHERE table_schema = ?
            AND table_name = 'tracked_file'
        );"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($MIGSCHEMA)
        || die "Error checking for table (tracked_file): " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}
+
# Abort init when the tracking table is already present in MIGSCHEMA.
sub die_if_mig_tracking_table_exists {
    die "Table $MIGSCHEMA.tracked_file already exists.  Bailing init...\n"
        if check_for_mig_tracking_table();
}
+
# Guard for sub-commands that require 'mig init' to have been run.
sub die_if_mig_tracking_table_does_not_exist {
    die "Table $MIGSCHEMA.tracked_file does not exist.  Bailing...\n"
        unless check_for_mig_tracking_table();
}
+
# Return true when MIGSCHEMA contains the 'tracked_column' tracking table.
sub check_for_mig_column_tracking_table {
    my $dbh = db_connect();
    # Bind the schema name instead of interpolating a quoted string.
    my $sth = $dbh->prepare("
        SELECT EXISTS(
            SELECT 1
            FROM information_schema.tables
            WHERE table_schema = ?
            AND table_name = 'tracked_column'
        );"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($MIGSCHEMA)
        || die "Error checking for table (tracked_column): " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}
+
# Abort init when the column-tracking table is already present.
sub die_if_mig_column_tracking_table_exists {
    die "Table $MIGSCHEMA.tracked_column already exists.  Bailing init...\n"
        if check_for_mig_column_tracking_table();
}
+
# Guard for sub-commands that require the column-tracking table.
sub die_if_mig_column_tracking_table_does_not_exist {
    die "Table $MIGSCHEMA.tracked_column does not exist.  Bailing...\n"
        unless check_for_mig_column_tracking_table();
}
+
# Return the tracking-table id for $file, or undef when it is not tracked.
#
# $file    - path of the data file
# $options - optional hashref; {allow_missing => 1} suppresses the
#            "file not found" die for files that no longer exist on disk
sub check_for_tracked_file {
    my $file = shift;
    my $options = shift;
    if (! -e $file) {
        die "file not found: $file\n" unless $options && $options->{'allow_missing'};
    }
    my $dbh = db_connect();
    # Schema names cannot be bind values, so MIGSCHEMA stays interpolated;
    # the filename is bound as a placeholder.
    my $sth = $dbh->prepare("
        SELECT id
        FROM $MIGSCHEMA.tracked_file
        WHERE base_filename = ?;"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($file)
        || die "Error checking table (tracked_file) for base_filename ($file): " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}
+
# Return the tracking-table id for a staged table/column pair, or undef
# when the column is not tracked.
#
# $table   - staged table name
# $column  - staged column name
# $options - accepted for symmetry with check_for_tracked_file; unused here
sub check_for_tracked_column {
    my ($table,$column,$options) = (shift,shift,shift);
    my $dbh = db_connect();
    # Schema names cannot be bind values; table/column values are bound.
    my $sth = $dbh->prepare("
        SELECT id
        FROM $MIGSCHEMA.tracked_column
        WHERE staged_table = ?
        AND staged_column = ?;"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($table,$column)
        || die "Error checking table (tracked_column) for $table.$column: " . $sth->errstr;
    my @cols = $sth->fetchrow_array;
    $sth->finish;
    db_disconnect($dbh);
    return $cols[0];
}
+
# Fetch the whole tracking row for $file as a hashref (undef when the file
# is not tracked).
sub status_this_file {
    my $file = shift;
    my $dbh = db_connect();
    # Schema names cannot be bind values; the filename is bound.
    my $sth = $dbh->prepare("
        SELECT *
        FROM $MIGSCHEMA.tracked_file
        WHERE base_filename = ?;"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($file)
        || die "Error retrieving data from table (tracked_file) for base_filename ($file): " . $sth->errstr;
    my $data = $sth->fetchrow_hashref;
    $sth->finish;
    db_disconnect($dbh);
    return $data;
}
+
# Fetch the whole tracking row for a staged table/column pair as a hashref
# (undef when the column is not tracked).
sub status_this_column {
    my ($table,$column) = (shift,shift);
    my $dbh = db_connect();
    # Schema names cannot be bind values; table/column values are bound.
    my $sth = $dbh->prepare("
        SELECT *
        FROM $MIGSCHEMA.tracked_column
        WHERE staged_table = ?
        AND staged_column = ?;"
    );
    # DBI reports failures via errstr, not $!.
    my $rv = $sth->execute($table,$column)
        || die "Error checking table (tracked_column) for $table.$column: " . $sth->errstr;
    my $data = $sth->fetchrow_hashref;
    $sth->finish;
    db_disconnect($dbh);
    return $data;
}
+
+1;
+
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-add - This will add the specified files to the mig tracking table for the
+schema pointed to by the MIGSCHEMA environment variable in the PostgreSQL
+database specified by various PG environment variables. <inhales, exhales>
+
+--headers (the default) and --no-headers are repeatable, and indicate whether
+subsequent files have headers or not
+
+--headers-file specifies a text file <hfile> defining the column headers for
+the next added <file>, which should contain one line per header
+
+--headers-file will automatically invoke --no-headers
+
+You'll need to invoke B<mig-init> prior to using commands like B<mig-add>
+
+=head1 SYNOPSIS
+
+B<mig-add> [--no-headers|--headers|--headers-file <hfile>] <file> [file|--no-headers|--headers|--headers-file <hfile>] [...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

my $has_headers = 1;             # files are assumed to have headers by default
my $headers_file;                # set by --headers-file; applies to one file
my $next_arg_is_headers_file = 0;

foreach my $arg (@ARGV) {
    if ($next_arg_is_headers_file) {
        $next_arg_is_headers_file = 0;
        $headers_file = abs_path($arg);
        next;
    }
    if ($arg eq '--headers') {
        $has_headers = 1;
        next;
    }
    if ($arg eq '--no-headers') {
        $has_headers = 0;
        next;
    }
    if ($arg eq '--headers-file') {
        # --headers-file implies --no-headers for the file it applies to
        $next_arg_is_headers_file = 1;
        $has_headers = 0;
        next;
    }
    my $file = abs_path($arg);
    # \Q...\E so regex metacharacters in the directory path (e.g. '.')
    # are matched literally.
    if ($file =~ /^\Q$MIGBASEWORKDIR\E/) {
        # NOTE(review): when the file is missing or not a regular file,
        # $headers_file is NOT cleared and would carry over to the next
        # file on the command line -- confirm whether that is intended.
        if (-e $file) {
            if (-f $file) {
                add_this_file($file,$has_headers,$headers_file);
                $headers_file = ''; # clear after applying to just one file
            } else {
                print "Not a real file: $file\n";
            }
        } else {
            print "Could not find file: $file\n";
        }
    } else {
        # The containment check is against MIGBASEWORKDIR, so report that
        # directory rather than MIGWORKDIR.
        print "File falls outside of MIGBASEWORKDIR ($MIGBASEWORKDIR): $file\n";
    }
}

exit 0;
+
+###############################################################################
+
# Insert a row into MIGSCHEMA.tracked_file for $file, unless it is already
# tracked or its headers file is missing.
#
# $file         - absolute path to the data file
# $headers      - truthy when the file's first row is a header row
# $headers_file - optional path of a text file listing the column headers
sub add_this_file {
    my $file = shift;
    my $headers = shift;
    my $headers_file = shift;
    if ($headers_file) {
        if (! (-e $headers_file && -f $headers_file)) {
            print "Could not find headers file $headers_file, skipping $file\n";
            return;
        }
    }
    if (Mig::check_for_tracked_file($file)) {
        print "File already tracked: $file\n";
        return;
    }
    # Build the description up front; the original used print(ternary),
    # which parses as a function call, and printed a stray leading space.
    my $header_desc = $headers_file
        ? "with headers file = $headers_file"
        : ($headers ? 'with headers' : 'without headers');
    print "Adding ($header_desc): $file\n";
    my $dbh = Mig::db_connect();
    my $rv = $dbh->do("
        INSERT INTO $MIGSCHEMA.tracked_file (
            base_filename
            ,has_headers
            ,headers_file
        ) VALUES (
            " . $dbh->quote($file) . "
            ," . $dbh->quote($headers) . "
            ," . $dbh->quote($headers_file) . "
        );
    ") || die "Error inserting into table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
    Mig::db_disconnect($dbh);
}
+
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-clean
+
+Attempts to invoke B<clean_csv> on the iconv-converted (.utf8) version of the
+specified tracked file, placing the output in [file].utf8.clean
+
+If given no other arguments, the invocation will look like
+
+=over 5
+
+clean_csv --config scripts/clean.conf --fix --apply [--create-headers|--use-headers <hfile>] <file>
+
+=back
+
+otherwise, the arguments will be passed through like so
+
+=over 5
+
+clean_csv [other arguments...] <file>
+
+=back
+
+You'll need to invoke B<mig-iconv> or B<mig-skip-iconv> prior to using commands
+like B<mig-clean>
+
+=head1 SYNOPSIS
+
+B<mig-clean> <file> [other arguments...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

# Only operate on files under the migration base work directory.  \Q...\E
# keeps regex metacharacters in the path literal, and the message names
# the directory actually checked (MIGBASEWORKDIR, not MIGWORKDIR).
my $file = abs_path($ARGV[0]);
if ($file =~ /^\Q$MIGBASEWORKDIR\E/) {
    call_clean_csv(@ARGV);
} else {
    print "File falls outside of MIGBASEWORKDIR ($MIGBASEWORKDIR): $file\n";
}

exit 0;
+
+###############################################################################
+
# Run clean_csv against the tracked file's .utf8 version and record the
# resulting .clean filename (or '') in the tracking table.
#
# First argument is the base file; any further arguments replace the
# default clean_csv options (see POD).
sub call_clean_csv {
    my $file = abs_path(shift);
    my @args = @_;

    my $tracked_file_id = Mig::check_for_tracked_file($file);
    if (! $tracked_file_id) {
        print "File not currently tracked: $file\n";
        return;
    }
    my $data = Mig::status_this_file($file);

    if (! $data->{'utf8_filename'}) {
        die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
    }

    my $utf8_file = $data->{'utf8_filename'};
    if (! -e $utf8_file) {
        die "missing file: $utf8_file\n";
    }

    print "cleaning tracked file: $file\n";

    if (scalar(@args) == 0) {
        # Default clean_csv invocation; see the POD above.
        @args = (
            '--config'
            ,'scripts/clean.conf'
            ,'--fix'
            ,'--apply'
            ,'--backslash'
            ,'--pad'
        );
        if (! $data->{'has_headers'}) {
            if ($data->{'headers_file'}) {
                push @args, '--use-headers', $data->{'headers_file'};
            } else {
                push @args, '--create-headers';
            }
        }
    }

    print join(' ',@args) . "\n";
    system('clean_csv', @args, $utf8_file);

    # Record '' when clean_csv produced no output file.  Keep the raw path
    # for the user-facing message; only the SQL literal gets quoted.
    my $clean_path = $utf8_file . '.clean';
    if (! -e $clean_path) {
        print "clean file does not exist: $clean_path\n";
        $clean_path = '';
    }

    my $dbh = Mig::db_connect();
    my $rv = $dbh->do("
        UPDATE $MIGSCHEMA.tracked_file
        SET clean_filename = " . $dbh->quote($clean_path) . "
        WHERE base_filename = " . $dbh->quote($file) . "
        ;
    ") || die "Error updating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
    Mig::db_disconnect($dbh);
}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-convert
+
+Attempts to invoke B<csv2sql> on the .utf8.clean version of the specified
+tracked file, creating either [file].utf8.clean.stage.sql or
+<parent table>_stage.sql depending on whether the file has been linked to a
+parent table within the migration schema or not.
+
+If given no other arguments, the invocation will look like
+
+=over 5
+
+csv2sql --config scripts/clean.conf --add-x-migrate --schema <MIGSCHEMA> [--parent <PARENT TABLE>] --outfile <[<FILE>.utf8.clean.stage.sql]|[parent_table_stage.sql]> <FILE>.utf8.clean
+
+=back
+
+otherwise, the arguments will be passed through like so
+
+=over 5
+
+csv2sql [other arguments...] --schema <MIGSCHEMA> [--parent <PARENT TABLE>] --outfile <[<FILE>.utf8.clean.stage.sql]|[parent_table_stage.sql]> <FILE>.utf8.clean
+
+=back
+
+=head1 SYNOPSIS
+
+B<mig-convert> <file> [other arguments...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';

Mig::die_if_no_env_migschema();
Mig::die_if_mig_tracking_table_does_not_exist();

# Only operate on files under the migration base work directory.  \Q...\E
# keeps regex metacharacters in the path literal, and the message names
# the directory actually checked (MIGBASEWORKDIR, not MIGWORKDIR).
my $file = abs_path($ARGV[0]);
if ($file =~ /^\Q$MIGBASEWORKDIR\E/) {
    call_convert_csv(@ARGV);
} else {
    print "File falls outside of MIGBASEWORKDIR ($MIGBASEWORKDIR): $file\n";
}

exit 0;
+
+###############################################################################
+
# Run csv2sql on the tracked file's .utf8.clean version and record the
# resulting stage SQL filename (or '') in the tracking table.
#
# First argument is the base file; any further arguments replace the
# default csv2sql options (the --schema/--parent/--outfile arguments are
# always appended).
sub call_convert_csv {
    my $file = abs_path(shift);
    my @args = @_;

    my $stage_sql_filename;
    my $tracked_file_id = Mig::check_for_tracked_file($file);
    if (! $tracked_file_id) {
        print "File not currently tracked: $file\n";
        return;
    }
    my $data = Mig::status_this_file($file);

    if (! $data->{'utf8_filename'}) {
        die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
    }

    if (! $data->{'clean_filename'}) {
        die "mig-clean or mig-skip-clean needed for .clean version of file: $file\n";
    }

    my $clean_file = $data->{'clean_filename'};
    if (! -e $clean_file) {
        die "missing file: $clean_file\n";
    }

    print "converting tracked file: $file\n";

    if (scalar(@args) == 0) {
        # Default csv2sql invocation; see the POD above.
        @args = (
            '--config'
            ,'scripts/clean.conf'
            ,'--add-x-migrate'
        );
    }
    push @args, '--use-no-headers-file';
    push @args, '--schema', $MIGSCHEMA;
    if ($data->{'parent_table'}) {
        push @args, '--parent', $data->{'parent_table'};
        # NOTE(review): the POD advertises <parent table>_stage.sql, but
        # this produces <parent table>.stage.sql relative to the current
        # directory -- confirm which is intended.
        $stage_sql_filename = $data->{'parent_table'} . '.stage.sql';
    } else {
        $stage_sql_filename = "$clean_file.stage.sql";
    }
    push @args, '--outfile', $stage_sql_filename;

    print "args: " . join(',',@args) . "\n";
    system('csv2sql', @args, $clean_file);

    # Record '' when csv2sql produced no output file.
    if (! -e $stage_sql_filename) {
        print "SQL converted file does not exist: $stage_sql_filename\n";
        $stage_sql_filename = '';
    }

    my $dbh = Mig::db_connect();
    my $rv = $dbh->do("
        UPDATE $MIGSCHEMA.tracked_file
        SET stage_sql_filename = " . $dbh->quote($stage_sql_filename) . "
        WHERE base_filename = " . $dbh->quote($file) . "
        ;
    ") || die "Error updating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
    Mig::db_disconnect($dbh);
}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-env - This tool is for tracking and setting environment variables used by
+B<mig> and its sub-tools.
+
+=head1 SYNOPSIS
+
+B<mig-env> <create|use> <migration_schema>
+
+B<mig-env> <show> [migration_schema]
+
+B<mig-env> <list>
+
+B<mig-env> <help>
+
+=head1 DESCRIPTION
+
+For most invocations, B<mig-env> will either create or use a migration-specific
+file (~/.mig/<migration_schema>.env) for setting the following environment
+variables:
+
+=over 15
+
+=item MIGSCHEMA
+
+The name of the migration schema. Convention has this being a single lowercased
+word or acronym identifying the library, prefixed with 'm_'.
+
+=item MIGWORKDIR
+
+The base working directory for containing migration data, scripts, and other
+files.
+
+=item PGHOST
+
+The IP address or hostname for the PostgreSQL database used for a migration.
+
+=item PGPORT
+
+The TCP port for the PostgreSQL database.
+
+=item PGUSER
+
+The PostgreSQL user to use for the database.
+
+=item PGDATABASE
+
+The name of the actual database containing the migration schema.
+
+=back
+
+This script may also setup a symlink from a specified Git repository to a
+scripts/ directory within the migration work directory. The default for this is
+~/git/migration-work/past_migrations/MIGSCHEMA --> MIGWORKDIR/scripts
+
+It may also create the migration work directory if necessary.
+
+=head1 COMMANDS
+
+=over 15
+
+=item B<create> <schema>
+
+This invocation will prompt for various values and create a .env file for the
+specified migration schema, and a symlink between the specified Git repository
+and migration work directory (which will also be created if needed).
+
+=item B<use> <schema>
+
+This command will spawn a bash shell that executes the corresponding
+~/.mig/<schema>.env script for setting up environment variables encoded during
+B<create>.
+
+=item B<show> [schema]
+
+This command will show the contents of the corresponding ~/.mig/<schema>.env
+script, or, if no schema is specified, then it will list pertinent variables in
+the current environment if they exist.
+
+=item B<list>
+
+This command will list migration schemas found in ~/.mig
+
+=item B<help>
+
+Display the documentation you're reading now.
+
+=back
+
+=cut
+
+###############################################################################
+
+use strict;
+use 5.012;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use File::Path qw(make_path);
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+
pod2usage(-verbose => 2) if ! $ARGV[0];

my $migration_schema = $ARGV[1] || '';
my $filename = "$HOME/.mig/$migration_schema.env";
# Dispatch on the sub-command; 'create' and 'use' require a schema name.
switch($ARGV[0]) {
    case "--help" {
        pod2usage(-verbose => 2);
    }
    case "help" {
        pod2usage(-verbose => 2);
    }
    case "create" {
        pod2usage(-verbose => 1) if ! $ARGV[1];
        mig_env_create();
    }
    case "use" {
        pod2usage(-verbose => 1) if ! $ARGV[1];
        if (-e $filename) {
            # Replace this process with an interactive bash whose init
            # file exports the migration environment (written by
            # mig_env_create below).
            exec '/bin/bash', '--init-file', $filename;
        } else {
            die "\n$filename does not exist\n";
        }
    }
    case "show" {
        if (-e $filename) {
            exec '/bin/cat', $filename;
        } else {
            # No schema argument (or no .env file yet): dump pertinent
            # variables from the current environment instead.
            print `env | sort | egrep 'MIG|PG'`;
        }
    }
    case "list" {
        opendir(my $dh, "$HOME/.mig") || die "can't open $HOME/.mig: $!";
        # 'while (readdir $dh)' assigns to $_ implicitly (perl >= 5.12,
        # guaranteed by the 'use 5.012' above); the regex strips the
        # .env suffix to print just the schema names.
        while (readdir $dh) {
            if (/^(.*)\.env$/) {
                print "$1\n";
            }
        }
        closedir $dh;
    }
    else {
        pod2usage(1);
    }
}
+
# Interactively (re)create ~/.mig/<schema>.env, prompting for work/git
# directories and PostgreSQL connection settings, then create any needed
# directories, the scripts/ symlink, and the env file itself.
# Empty input at any prompt keeps the shown default.
sub mig_env_create {
    if (-e $filename) {
        print "Re-Creating $filename\n";
        print `cat $filename`;
    } else {
        print "Creating $filename\n";
    }
    print "\n";

    # directories
    # These variables are Env-tied, so assigning them also updates %ENV.

    $MIGBASEWORKDIR = "$HOME/data/" unless $MIGBASEWORKDIR;
    my $migworkdir_default = "$MIGBASEWORKDIR$migration_schema/";
    print "Main work directory (default $migworkdir_default): ";
    $MIGWORKDIR = <STDIN>;
    chomp $MIGWORKDIR;
    $MIGWORKDIR = $migworkdir_default if ! $MIGWORKDIR;

    $MIGBASEGITDIR = "$HOME/git/migration-work/" unless $MIGBASEGITDIR;
    my $miggitdir_default = "${MIGBASEGITDIR}past_migrations/$migration_schema/";
    print "git repo for migration-specific scripts (default $miggitdir_default): ";
    $MIGGITDIR = <STDIN>;
    chomp $MIGGITDIR;
    $MIGGITDIR = $miggitdir_default if ! $MIGGITDIR;

    # PostgreSQL
    # (The original declared 'my $PGUSER', masking the tied global so
    # PGUSER was never pushed into %ENV, unlike the other PG* settings.)

    $PGHOST = 'localhost' unless $PGHOST;
    my $pghost_default = $PGHOST;
    print "PGHOST (default $pghost_default): ";
    $PGHOST = <STDIN>;
    chomp $PGHOST;
    $PGHOST = $pghost_default if ! $PGHOST;

    $PGPORT = 5432 unless $PGPORT;
    my $pgport_default = $PGPORT;
    print "PGPORT (default $pgport_default): ";
    $PGPORT = <STDIN>;
    chomp $PGPORT;
    $PGPORT = $pgport_default if ! $PGPORT;

    $PGDATABASE = 'evergreen' unless $PGDATABASE;
    my $pgdatabase_default = $PGDATABASE;
    print "PGDATABASE (default $pgdatabase_default): ";
    $PGDATABASE = <STDIN>;
    chomp $PGDATABASE;
    $PGDATABASE = $pgdatabase_default if ! $PGDATABASE;

    $PGUSER = $PGDATABASE unless $PGUSER;
    my $pguser_default = $PGUSER;
    print "PGUSER (default $pguser_default): ";
    $PGUSER = <STDIN>;
    chomp $PGUSER;
    $PGUSER = $pguser_default if ! $PGUSER;

    # create files and directories if needed

    mkdir "$HOME/.mig";
    make_path($MIGGITDIR, { verbose => 1 });
    # list-form system avoids shell interpolation of the path
    system('touch', "$MIGGITDIR/README");
    make_path($MIGWORKDIR, { verbose => 1 });
    symlink $MIGGITDIR, "$MIGWORKDIR/scripts";
    # three-arg open with a lexical handle, checked for failure
    open my $env_fh, '>', $filename
        or die "cannot write $filename: $!\n";
    print $env_fh "export PGHOST=$PGHOST\n";
    print $env_fh "export PGPORT=$PGPORT\n";
    print $env_fh "export PGDATABASE=$PGDATABASE\n";
    print $env_fh "export PGUSER=$PGUSER\n";
    print $env_fh "export MIGENVPROMPT=$migration_schema\n";
    print $env_fh "export MIGSCHEMA=$migration_schema\n";
    print $env_fh "export MIGBASEWORKDIR=$MIGBASEWORKDIR\n";
    print $env_fh "export MIGWORKDIR=$MIGWORKDIR\n";
    print $env_fh "export MIGBASEGITDIR=$MIGBASEGITDIR\n";
    print $env_fh "export MIGGITDIR=$MIGGITDIR\n";
    print $env_fh "alias wcd='cd `mig wdir`'\n";
    print $env_fh "alias gcd='cd `mig gdir`'\n";
    print $env_fh "alias scd='cd `mig sdir`'\n";
    print $env_fh "source ~/.profile\n";
    print $env_fh "env | sort | egrep 'PG|MIG'\n";
    print $env_fh 'echo shell PID = $$' . "\n";
    close $env_fh or die "error writing $filename: $!\n";
}
+
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-iconv
+
+Attempts to invoke B<iconv> on the specified tracked file, placing the
+output in [file].utf8
+
+If given no other arguments, the invocation will look like
+
+=over 5
+
+iconv -f ISO-8859-1 -t UTF-8 -o <file>.utf8 <file>
+
+=back
+
+otherwise, the arguments will be passed through like so
+
+=over 5
+
+iconv [other arguments...] -o <file>.utf8 <file>
+
+=back
+
+You'll need to invoke B<mig-add> prior to using commands like B<mig-iconv>
+
+=head1 SYNOPSIS
+
+B<mig-iconv> <file> [other arguments...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+# Show full usage when no file argument was given or help was requested.
+pod2usage(-verbose => 2) unless $ARGV[0] && $ARGV[0] ne '--help';
+
+# Both the schema environment and the tracking table must exist first.
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+# Only operate on files that live under the migration work directory.
+my $file = abs_path($ARGV[0]);
+if ($file !~ /^$MIGBASEWORKDIR/) {
+    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+} else {
+    call_iconv(@ARGV);
+}
+
+exit 0;
+
+###############################################################################
+
+# Run iconv over a tracked file, producing <file>.utf8, and record the
+# resulting filename in the tracking table.
+#
+# Arguments: the tracked file path, followed by any extra iconv arguments.
+# With no extra arguments a default ISO-8859-1 -> UTF-8 conversion is used.
+sub call_iconv {
+    my $file = abs_path(shift);
+    my @args = @_;
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if ($tracked_file_id) {
+        my $data = Mig::status_this_file($file);
+        print "iconv'ing tracked file: $file\n";
+
+        # Default conversion when the caller supplied no iconv arguments.
+        if (scalar(@args) == 0) {
+            @args = (
+                '-f'
+                ,'ISO-8859-1'
+                ,'-t'
+                ,'UTF-8'
+                ,'--verbose'
+            );
+        }
+
+        # List-form system(): no shell involved, arguments passed verbatim.
+        system('iconv', @args, '-o', $file . '.utf8', $file);
+        system('touch', $file . '.utf8'); # handle 0-byte files
+
+        my $dbh = Mig::db_connect();
+        my $utf8_file = $dbh->quote($file . '.utf8');
+        # iconv may fail without creating output; record an empty name then.
+        if (! -e $file . '.utf8') {
+            print "utf8 file does not exist: $utf8_file\n";
+            $utf8_file = $dbh->quote('');
+        }
+
+        # Record (or clear) the utf8 filename for this tracked file.
+        # Message fixed ("updating", was "inserting") and DBI errors are
+        # reported via errstr -- $! is not set by DBI.
+        my $rv = $dbh->do("
+            UPDATE $MIGSCHEMA.tracked_file
+            SET utf8_filename = $utf8_file
+            WHERE base_filename = " . $dbh->quote($file) . "
+            ;
+        ") || die "Error updating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        print "File not currently tracked: $file\n";
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-init - This will add or recreate tracking tables for the B<mig> toolset to
+the migration schema specified by the MIGSCHEMA environment variable, in the
+PostgreSQL database specified by various PG environment variables.
+
+In practice, you should invoke 'mig env use schema_name' prior to calling
+B<init>
+
+=head1 SYNOPSIS
+
+B<mig-init>
+
+B<mig-init> <help>
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+# Any argument at all (including 'help') just shows the full usage text.
+pod2usage(-verbose => 2) if $ARGV[0];
+
+Mig::die_if_no_env_migschema();
+
+# Create the schema and the stock migration tables only when missing,
+# then (re)create the mig tracking tables.
+try_to_create_schema()
+    unless Mig::check_for_db_migschema();
+try_to_init_schema_with_migration_tools()
+    unless Mig::check_db_migschema_for_migration_tables();
+vivicate_mig_tracking_tables();
+
+exit 0;
+
+###############################################################################
+
+# Create the migration schema itself, after a sanity check on the name.
+# Dies on failure.
+sub try_to_create_schema {
+    # \w already covers underscore; anything outside \w is rejected so the
+    # (unquoted) identifier is safe to interpolate into the DDL below.
+    if ($MIGSCHEMA =~ /[^\w_]/) {
+        die "$MIGSCHEMA is not suitable for a schema name in PostgreSQL\n";
+    }
+    my $dbh = Mig::db_connect();
+    # DBI failures are reported via errstr; $! is not set by DBI.
+    my $rv = $dbh->do("CREATE SCHEMA $MIGSCHEMA;")
+        || die "Error creating migration schema ($MIGSCHEMA): " . $dbh->errstr . "\n";
+    print "Created schema $MIGSCHEMA\n";
+    Mig::db_disconnect($dbh);
+}
+
+# Seed the schema using the migration_tools stored procedures: init()
+# sets up configuration, build() creates the stock migration tables.
+# Dies on failure of either call.
+sub try_to_init_schema_with_migration_tools {
+    Mig::die_if_no_migration_tools();
+    print "Calling migration_tools.init() and .build()\n";
+    my $dbh = Mig::db_connect();
+    # DBI failures are reported via errstr; $! is not set by DBI.
+    my $rv = $dbh->do("SELECT migration_tools.init(" . $dbh->quote($MIGSCHEMA) . ");")
+        || die "Error running migration_tools.init($MIGSCHEMA): " . $dbh->errstr . "\n";
+    print "migration_tools.init() finished\n";
+    my $rv2 = $dbh->do("SELECT migration_tools.build(" . $dbh->quote($MIGSCHEMA) . ");")
+        || die "Error running migration_tools.build($MIGSCHEMA): " . $dbh->errstr . "\n";
+    print "migration_tools.build() finished\n";
+    Mig::db_disconnect($dbh);
+}
+
+# Create the mig tracking tables (tracked_file, tracked_column) plus the
+# indexes mig relies on.  Dies if either table already exists, or on any
+# DDL failure.
+sub vivicate_mig_tracking_tables {
+    Mig::die_if_mig_tracking_table_exists();
+    print "Creating table $MIGSCHEMA.tracked_file...\n";
+    my $dbh = Mig::db_connect();
+    # DBI failures are reported via errstr; $! is not set by DBI.
+    my $rv = $dbh->do("
+        CREATE TABLE $MIGSCHEMA.tracked_file (
+            id serial
+            ,base_filename TEXT UNIQUE
+            ,has_headers BOOLEAN
+            ,headers_file TEXT
+            ,utf8_filename TEXT
+            ,clean_filename TEXT
+            ,stage_sql_filename TEXT
+            ,map_sql_filename TEXT
+            ,prod_sql_filename TEXT
+            ,parent_table TEXT
+            ,staged_table TEXT
+        );
+    ") || die "Error creating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
+    Mig::die_if_mig_column_tracking_table_exists();
+    my $rv2 = $dbh->do("
+        CREATE TABLE $MIGSCHEMA.tracked_column (
+            id serial
+            ,base_filename TEXT
+            ,parent_table TEXT
+            ,staged_table TEXT
+            ,staged_column TEXT
+            ,comment TEXT
+            ,target_table TEXT
+            ,target_column TEXT
+            ,transform TEXT
+            ,summarize BOOLEAN
+        );
+    ") || die "Error creating table $MIGSCHEMA.tracked_column: " . $dbh->errstr . "\n";
+    # Indexes to support lookups by mapping target and by source file.
+    my $rv3 = $dbh->do("
+        CREATE INDEX ON $MIGSCHEMA.tracked_column(target_table,target_column);
+    ") || die "Error creating index on $MIGSCHEMA.tracked_column: " . $dbh->errstr . "\n";
+    my $rv4 = $dbh->do("
+        CREATE INDEX ON $MIGSCHEMA.tracked_column(base_filename);
+    ") || die "Error creating index on $MIGSCHEMA.tracked_column: " . $dbh->errstr . "\n";
+    Mig::db_disconnect($dbh);
+}
+
+
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-link
+
+Associate the specified file with a parent table within the migration schema.
+
+=head1 SYNOPSIS
+
+B<mig-link> <file> <parent table>
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+# Show full usage when no file argument was given or help was requested.
+pod2usage(-verbose => 2) unless $ARGV[0] && $ARGV[0] ne '--help';
+
+# Both the schema environment and the tracking table must exist first.
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+# Only operate on files that live under the migration work directory.
+my $file = abs_path($ARGV[0]);
+if ($file !~ /^$MIGBASEWORKDIR/) {
+    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+} else {
+    link_table(@ARGV);
+}
+
+exit 0;
+
+###############################################################################
+
+# Link a tracked file to a parent table in the migration schema.  Refuses
+# to link when the table is already linked to a different tracked file.
+#
+# Arguments: tracked file path, parent table name.  Dies if the table is
+# not in MIGSCHEMA, is claimed by another file, or on database errors.
+sub link_table {
+    my $file = abs_path(shift);
+    my $table = shift;
+
+    if (! Mig::check_db_migschema_for_specific_table($table)) {
+        die "table not found in MIGSCHEMA ($MIGSCHEMA): $table\n";
+    }
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if ($tracked_file_id) {
+        my $data = Mig::status_this_file($file);
+
+        print "linking file to parent table: $file -> $table\n";
+
+        my $dbh = Mig::db_connect();
+        # Guard: no two tracked files may claim the same parent table.
+        my $sth = $dbh->prepare("
+            SELECT base_filename
+            FROM $MIGSCHEMA.tracked_file
+            WHERE parent_table = " . $dbh->quote($table) . "
+            AND base_filename <> " . $dbh->quote($file) . ";"
+        );
+        # DBI failures are reported via errstr; $! is not set by DBI.
+        my $rv = $sth->execute()
+            || die "Error checking $MIGSCHEMA.tracked_file: " . $sth->errstr;
+        my @cols = $sth->fetchrow_array;
+        $sth->finish;
+        if ($cols[0]) { # found
+            die "table ($table) already linked to a different file: $cols[0]\n";
+        }
+        $rv = $dbh->do("
+            UPDATE $MIGSCHEMA.tracked_file
+            SET parent_table = " . $dbh->quote($table) . "
+            WHERE base_filename = " . $dbh->quote($file) . "
+            ;
+        ") || die "Error updating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        print "File not currently tracked: $file\n";
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-mapper
+
+Interactive session for analyzing, flagging, and mapping legacy field data to
+Evergreen fields.
+
+Upon exit, generate either [file].clean.map.sql or <parent table>_map.sql. The
+SQL generated will be UPDATE's for setting the Evergreen-specific columns for a
+given file's staging tables, and TRUNCATE's and INSERT's for auxiliary tables.
+The files will have \include hooks for pulling in additional mapping files
+(for example, end-user mappings for circ modifiers, etc.)
+
+=head1 SYNOPSIS
+
+B<mig-mapper> <file>
+
+=cut
+
+###############################################################################
+
+use strict;
+use Term::ReadLine;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+# Show full usage when no file argument was given or help was requested.
+pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+my $column_filter = 1; # show all fields
+my $file = abs_path($ARGV[0]);
+my $fdata;
+my $tracked_file_id = Mig::check_for_tracked_file($file);
+if ($tracked_file_id) {
+    $fdata = Mig::status_this_file($file);
+} else {
+    die "File not currently tracked: $file\n";
+}
+
+# The mapper works against the staging table recorded for this file.
+my $table = $fdata->{staged_table};
+if (!$table) {
+    # Fixed duplicated word in the error message ("staged staged").
+    die "No staged table for file: $file\n";
+}
+
+# Interactive session state shared with the menu subs below.
+my $loop = 1;
+my $term = Term::ReadLine->new('mapper');
+my $prompt;
+my $OUT = $term->OUT || \*STDOUT;
+my @dtd_identifiers;
+
+table_menu();
+# Main REPL.  A bare number opens the column menu for that column's
+# dtd_identifier; other commands are dispatched by the switch below.
+# Any input starting with 'q' ends the loop.
+$prompt = "$fdata->{staged_table}: ";
+while ( $loop && defined (my $cmd = $term->readline($prompt)) ) {
+top:
+    # Trim surrounding whitespace and keep non-blank input in history.
+    $cmd =~ s/^\s+//;
+    $cmd =~ s/\s+$//;
+    $term->addhistory($cmd) if $cmd =~ /\S/;
+    if ($cmd =~ /^\d+$/) {
+        # column_menu() may return another dtd identifier (its 'n'/'p'
+        # next/prev commands); jump back to `top` to process it without
+        # a fresh readline.
+        my $ret = column_menu($cmd);
+        if ($ret) {
+            $cmd = $ret;
+            goto top;
+        }
+    } else {
+        switch($cmd) {
+            # 'ls', '?', '.', ';' or an empty line all redraw the menu.
+            case /^(ls|\?|\.|;)$/ {
+                table_menu();
+            }
+            case '' {
+                table_menu();
+            }
+            case 'l' {
+                list_ten();
+            }
+            # f1/f2/f3 adjust the column filter, then redraw.
+            case 'f1' {
+                $column_filter = 1;
+                table_menu();
+            }
+            case 'f2' {
+                $column_filter = 2;
+                table_menu();
+            }
+            case 'f3' {
+                $column_filter = 3;
+                table_menu();
+            }
+        }
+    }
+    $loop = 0 if $cmd =~ /^q/io;
+}
+
+exit 0;
+
+###############################################################################
+
+# Render the top-level table menu: command help plus one row per staging
+# table column (filtered by the global $column_filter: 1 = all, 2 = only
+# legacy l_/x_ columns, 3 = only non-legacy columns), paged through less.
+# Side effect: (re)fills the global @dtd_identifiers so numeric menu
+# picks can be mapped back to columns.
+sub table_menu {
+    print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+    print "$table";
+    print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+    print "\n";
+    print " l) SELECT * FROM $fdata->{staged_table} LIMIT 10;\n";
+    print "f1) show all fields (default)\n";
+    print "f2) show legacy fields\n";
+    print "f3) show EG fields\n";
+    print " q) quit\n\n";
+    # Fixed-width header row matching the per-column printf widths below.
+    printf "%-36s", "Columns (* for required)";
+    printf "%-30s", "Target";
+    printf "%-30s", "Transform";
+    printf "%-30s", "First Row";
+    printf "%-30s", "Migration Note";
+    print "\n";
+    printf "%-36s", "-------";
+    printf "%-30s", "------";
+    printf "%-30s", "---------";
+    printf "%-30s", "---------";
+    printf "%-30s", "--------------";
+    print "\n";
+    my $dbh = Mig::db_connect();
+    # Column metadata, ordered by dtd_identifier so menu numbers are stable.
+    my $sth = $dbh->prepare("
+        SELECT *
+        FROM information_schema.columns
+        WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . "
+        AND table_name = " . $dbh->quote($table) . "
+        ORDER BY dtd_identifier::INTEGER ASC;
+    ");
+    my $rv = $sth->execute()
+        || die "Error retrieving data from information_schema: $!";
+    # One sample row, used for the "First Row" display column.
+    my $sth2 = $dbh->prepare("
+        SELECT *
+        FROM $MIGSCHEMA.$table
+        LIMIT 1;
+    ");
+    my $rv2 = $sth2->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+    my $row = $sth2->fetchrow_hashref;
+
+    # NOTE(review): bareword handle and 2-arg pipe open, unchecked; if
+    # less is unavailable the prints below fail silently.
+    open LESS, "|less -F";
+    @dtd_identifiers = ();
+    while (my $data = $sth->fetchrow_hashref) {
+        my $column = $data->{column_name};
+        # Apply the f2/f3 filters; l_/x_ prefixes mark legacy columns.
+        if ($column_filter == 2 && !($column =~ /^[xl]_/)) {
+            next;
+        }
+        if ($column_filter == 3 && ($column =~ /^[xl]_/)) {
+            next;
+        }
+        my $cdata = status_this_column($column);
+        printf LESS $cdata->{required} ? '*' : ' ';
+        printf LESS "%3s) ", $data->{dtd_identifier};
+        push @dtd_identifiers, $data->{dtd_identifier};
+        printf LESS "%-30s", $column;
+        # Show "other_table.column" when the target is a different table.
+        printf LESS "%-30s", defined $cdata->{target_table}
+            ? ( $cdata->{target_table} ne $table ? $cdata->{target_table} . '.' : '') . $cdata->{target_column}
+            : '';
+        printf LESS "%-30s", defined $cdata->{transform} ? $cdata->{transform} : '';
+        printf LESS "%-30s", defined $$row{$column} ? $$row{$column} : '';
+        printf LESS "%-30s", defined $cdata->{comment} ? $cdata->{comment} : '';
+        print LESS "\n";
+    }
+    close LESS;
+    print "\n";
+    $sth->finish;
+    $sth2->finish;
+    Mig::db_disconnect($dbh);
+}
+
+# Per-column interactive menu, entered from the table menu by number
+# (dtd_identifier).  Supports browsing, summarizing, commenting, and
+# setting mapping target/transform for one staging column.  Returns a
+# dtd_identifier (for the 'n'/'p' next/prev commands) or undef; the
+# caller feeds a returned identifier back through its own loop.
+sub column_menu {
+    my $dtd_identifier = shift;
+    my $dbh = Mig::db_connect();
+    # Resolve the dtd_identifier back to a column name.
+    my $sth = $dbh->prepare("
+        SELECT *
+        FROM information_schema.columns
+        WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . "
+        AND table_name = " . $dbh->quote($table) . "
+        AND dtd_identifier = " . $dbh->quote($dtd_identifier) . ";
+    ");
+    my $rv = $sth->execute()
+        || die "Error retrieving data from information_schema: $!";
+    my $data = $sth->fetchrow_hashref;
+    $sth->finish;
+    Mig::db_disconnect($dbh);
+
+    my $column = $data->{column_name};
+
+    my $prompt = "$table.$column: ";
+
+    # NOTE(review): named subs are compiled once at package level even
+    # when written inside another sub; print_menu takes $column as an
+    # argument (rather than closing over column_menu's lexical) for
+    # exactly that reason.
+    sub print_menu {
+        my $column = shift;
+        my $cdata = status_this_column($column);
+        print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+        print "$column";
+        print "\n-------------------------------------------------------------------------------------------------\n";
+        print "   target: " . (
+            defined $cdata->{target_table}
+            ? ( $cdata->{target_table} ne $table ? $cdata->{target_table} . '.' : '') . $cdata->{target_column}
+            : ''
+        ) . "\n";
+        print "transform: " . (defined $cdata->{transform} ? $cdata->{transform} : '') . "\n";
+        print "  comment: " . (defined $cdata->{comment} ? $cdata->{comment} : '') . "\n";
+        print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+        print "\n";
+        print " l) SELECT $column FROM $fdata->{staged_table} LIMIT 10;\n";
+        print " s) summarize\n" if $column ne 'x_migrate';
+        print " g) group browse\n";
+        print "g2) group browse (order by count desc)\n";
+        print " c) comment\n";
+        print " f) flag for end-user mapping\n";
+        print " t) target\n";
+        print " e) eval/transform\n";
+        print " n) next column\n";
+        print " p) prev column\n";
+        print " q) quit back to table menu\n";
+        print "\n";
+    }
+    print_menu($column);
+
+    # Inner REPL for this column; 'q' falls through back to the caller.
+    my $loop = 1;
+    while ( $loop && defined (my $cmd = $term->readline($prompt)) ) {
+        $cmd =~ s/^\s+//;
+        $cmd =~ s/\s+$//;
+        $term->addhistory($cmd) if $cmd =~ /\S/;
+        $loop = 0 if $cmd =~ /^q/io;
+        switch($cmd) {
+            case /^(ls|\?|\.|;)$/ {
+                print_menu($column);
+            }
+            case '' {
+                print_menu($column);
+            }
+            case 'l' {
+                list_ten($column);
+            }
+            case 's' {
+                summarize($column);
+            }
+            case 'g' {
+                group_browse($column);
+            }
+            case 'g2' {
+                group_browse($column,'GROUP BY 1 ORDER BY 2 DESC');
+            }
+            # c/t/e take the rest of the line as their argument.
+            case /^c/io {
+                if ($cmd =~ /^c\s+(.+)$/) {
+                    set_comment($column,$1);
+                }
+            }
+            case /^t/io {
+                if ($cmd =~ /^t\s+(.+)$/) {
+                    set_target($column,$1);
+                }
+            }
+            case /^e/io {
+                if ($cmd =~ /^e\s+(.+)$/) {
+                    set_transform($column,$1);
+                }
+            }
+            # n/p: locate the current identifier in @dtd_identifiers and
+            # return the neighbor.  Note index - 1 at position 0 becomes
+            # -1, i.e. 'p' on the first column wraps to the last one.
+            case 'n' {
+                my( $index )= grep { $dtd_identifiers[$_] eq $dtd_identifier } 0..$#dtd_identifiers;
+                return $dtd_identifiers[$index + 1];
+            }
+            case 'p' {
+                my( $index )= grep { $dtd_identifiers[$_] eq $dtd_identifier } 0..$#dtd_identifiers;
+                return $dtd_identifiers[$index - 1];
+            }
+        }
+    }
+}
+
+# Print up to ten rows of the staging table -- one column when a column
+# name is given, all columns otherwise -- comma-joined, one tab-indented
+# line per row.
+sub list_ten {
+    my $column = shift;
+    my $selection = defined $column ? $column : '*';
+
+    # NOTE(review): the handle is not disconnected here, matching the
+    # original behavior.
+    my $dbh = Mig::db_connect();
+    my $sth = $dbh->prepare(Mig::sql("
+        SELECT $selection
+        FROM $MIGSCHEMA.$table
+        LIMIT 10;
+    "));
+    $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+    print "\n";
+    while (my @row = $sth->fetchrow_array) {
+        print "\t" . join(',', map { defined $_ ? $_ : '' } @row) . "\n";
+    }
+    print "\n";
+    $sth->finish;
+}
+
+# Print a battery of summary statistics for one column of the staging
+# table: row counts, distinct-value counts under several attempted type
+# casts, and min/max values and lengths as text, integer, money (6- and
+# 8-digit), date, and timestamptz.
+#
+# The migration_tools.attempt_* functions return a caller-supplied
+# sentinel value when the cast fails; NULLIF strips the sentinel back out
+# so rows that fail the cast do not count as values.
+sub summarize {
+    my $column = shift;
+
+    my $dbh = Mig::db_connect();
+
+    my $count = _single_value($dbh, "
+        SELECT COUNT(*)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $non_empty_count = _single_value($dbh, "
+        SELECT COUNT(*)
+        FROM $MIGSCHEMA.$table
+        WHERE $column IS NOT NULL AND BTRIM($column) <> '';
+    ");
+    my $distinct_value_count = _single_value($dbh, "
+        SELECT COUNT(DISTINCT $column)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $distinct_integer_value_count = _single_value($dbh, "
+        SELECT COUNT(DISTINCT migration_tools.attempt_cast($column,'INTEGER')::INTEGER)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $distinct_money6_value_count = _single_value($dbh, "
+        SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $distinct_money8_value_count = _single_value($dbh, "
+        SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $distinct_date_value_count = _single_value($dbh, "
+        SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $distinct_timestamptz_value_count = _single_value($dbh, "
+        SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
+        FROM $MIGSCHEMA.$table;
+    ");
+
+    my $min_value = _single_value($dbh, "
+        SELECT MIN($column)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_length = _single_value($dbh, "
+        SELECT MIN(LENGTH($column))
+        FROM $MIGSCHEMA.$table;
+    ");
+    # NOTE: $min_length/$max_length are interpolated into the SQL below;
+    # on an empty table they are undef and the query fails, matching the
+    # original behavior.
+    my $min_length_min_value = _single_value($dbh, "
+        SELECT MIN($column)
+        FROM $MIGSCHEMA.$table
+        WHERE LENGTH($column) = $min_length;
+    ");
+    my $min_value_as_integer = _single_value($dbh, "
+        SELECT MIN(migration_tools.attempt_cast($column,'INTEGER')::INTEGER)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_value_as_money6 = _single_value($dbh, "
+        SELECT MIN(NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_value_as_money8 = _single_value($dbh, "
+        SELECT MIN(NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_value_as_date = _single_value($dbh, "
+        SELECT MIN(NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_value_as_timestamptz = _single_value($dbh, "
+        SELECT MIN(NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
+        FROM $MIGSCHEMA.$table;
+    ");
+
+    my $max_value = _single_value($dbh, "
+        SELECT MAX($column)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_length = _single_value($dbh, "
+        SELECT MAX(LENGTH($column))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_length_max_value = _single_value($dbh, "
+        SELECT MAX($column)
+        FROM $MIGSCHEMA.$table
+        WHERE LENGTH($column) = $max_length;
+    ");
+    my $max_value_as_integer = _single_value($dbh, "
+        SELECT MAX(migration_tools.attempt_cast($column,'INTEGER')::INTEGER)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_value_as_money6 = _single_value($dbh, "
+        SELECT MAX(NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_value_as_money8 = _single_value($dbh, "
+        SELECT MAX(NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_value_as_date = _single_value($dbh, "
+        SELECT MAX(NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_value_as_timestamptz = _single_value($dbh, "
+        SELECT MAX(NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
+        FROM $MIGSCHEMA.$table;
+    ");
+
+    Mig::db_disconnect($dbh);
+
+    # Output kept identical to the original report format.
+    print "\n";
+    print "# of rows = $count\n";
+    print "# of non-empty rows = $non_empty_count\n";
+    print "# of distinct values (as text) = $distinct_value_count\n";
+    print "# of distinct values (as integer) = $distinct_integer_value_count\n";
+    print "# of distinct values (as money6) = $distinct_money6_value_count\n";
+    print "# of distinct values (as money8) = $distinct_money8_value_count\n";
+    print "# of distinct values (as date) = $distinct_date_value_count\n";
+    print "# of distinct values (as timestamptz) = $distinct_timestamptz_value_count\n";
+    print "\n";
+    print "minimum value (as text) = $min_value\n";
+    print "maximum value (as text) = $max_value\n";
+    print "\n";
+    print "minimum value length (as text) = $min_length (min value: $min_length_min_value)\n";
+    print "maximum value length (as text) = $max_length (max value: $max_length_max_value)\n";
+    print "\n";
+    print "minimum value (as integer) = " . ($min_value_as_integer ? $min_value_as_integer : '') . "\n";
+    print "maximum value (as integer) = " . ($max_value_as_integer ? $max_value_as_integer : '') . "\n";
+    print "\n";
+    print "minimum value (as money6) = " . ($min_value_as_money6 ? $min_value_as_money6 : '') . "\n";
+    print "maximum value (as money6) = " . ($max_value_as_money6 ? $max_value_as_money6 : '') . "\n";
+    print "\n";
+    print "minimum value (as money8) = " . ($min_value_as_money8 ? $min_value_as_money8 : '') . "\n";
+    print "maximum value (as money8) = " . ($max_value_as_money8 ? $max_value_as_money8 : '') . "\n";
+    print "\n";
+    print "minimum value (as date) = " . ($min_value_as_date ? $min_value_as_date : '') . "\n";
+    print "maximum value (as date) = " . ($max_value_as_date ? $max_value_as_date : '') . "\n";
+    print "\n";
+    print "minimum value (as timestamptz) = " . ($min_value_as_timestamptz ? $min_value_as_timestamptz : '') . "\n";
+    print "maximum value (as timestamptz) = " . ($max_value_as_timestamptz ? $max_value_as_timestamptz : '') . "\n";
+    print "\n";
+}
+
+# Prepare/execute $sql on $dbh and return the first column of the first
+# row.  Factors out the prepare/execute/fetch/finish boilerplate repeated
+# for every statistic above.  Dies on execute failure, reporting the DBI
+# error (DBI does not set $!).
+sub _single_value {
+    my ($dbh, $sql) = @_;
+    my $sth = $dbh->prepare($sql);
+    $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: " . $sth->errstr;
+    my @cols = $sth->fetchrow_array;
+    $sth->finish;
+    return $cols[0];
+}
+
+# Browse distinct values of a column with their row counts, paged through
+# less.  $option defaults to grouping and ordering by value; the caller's
+# g2 command passes an ORDER BY count DESC variant instead.
+sub group_browse {
+    my ($column,$option) = (shift,shift||"GROUP BY 1 ORDER BY 1");
+
+    my $dbh = Mig::db_connect();
+    my $sth = $dbh->prepare(Mig::sql("
+        SELECT $column, COUNT(*)
+        FROM $MIGSCHEMA.$table
+        $option;
+    "));
+    my $rv = $sth->execute()
+        || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+
+    print "\n";
+    # Lexical handle, list-form pipe open (no shell), and a checked open
+    # -- the original bareword 2-arg open failed silently when less was
+    # unavailable.
+    open my $less, '|-', 'less', '-F'
+        or die "Error opening pipe to less: $!\n";
+    printf $less "%-30s", "Value:";
+    print $less "Count:\n\n";
+    while (my @cols = $sth->fetchrow_array) {
+        my ($value, $count) = @cols;
+        printf $less "%-30s", defined $value ? $value : '<NULL>';
+        print $less "$count\n";
+    }
+    close $less;
+    print "\n";
+    $sth->finish;
+}
+
+###############################################################################
+
+# Ensure a tracked_column row exists for $column in the current staging
+# table; insert one if the column is not yet tracked.
+sub add_this_column {
+    my $column = shift;
+    return if Mig::check_for_tracked_column($table,$column);
+
+    my $dbh = Mig::db_connect();
+    my $rv = $dbh->do("
+        INSERT INTO $MIGSCHEMA.tracked_column (
+            base_filename
+            ,parent_table
+            ,staged_table
+            ,staged_column
+        ) VALUES (
+            " . $dbh->quote($file) . "
+            ," . $dbh->quote($fdata->{parent_table}) . "
+            ," . $dbh->quote($table) . "
+            ," . $dbh->quote($column) . "
+        );
+    ") || die "Error inserting into table $MIGSCHEMA.tracked_column: $!\n";
+    Mig::db_disconnect($dbh);
+}
+
+# Return the tracked_column status hashref for $column, creating the
+# tracking row first when none exists.  Also sets $$data{required} to 1
+# or 0 depending on whether the column's parent table/column appears in
+# $MIGSCHEMA.fields_requiring_mapping.
+sub status_this_column {
+    my $column = shift;
+    my $data = Mig::status_this_column($table,$column);
+    if (!$data) {
+        # Auto-vivify the tracking row, then re-read its status.
+        add_this_column($column);
+        $data = Mig::status_this_column($table,$column);
+    }
+    if ($$data{parent_table}) {
+        my $dbh = Mig::db_connect();
+        my $sth = $dbh->prepare("
+            SELECT *
+            FROM $MIGSCHEMA.fields_requiring_mapping
+            WHERE table_name = " . $dbh->quote( $$data{parent_table} ) . "
+            AND column_name = " . $dbh->quote( $column ) . ";"
+        );
+        my $rv = $sth->execute()
+            || die "Error checking table (tracked_column) for $table.$column: $!";
+        # Any row at all means this column needs end-user mapping.
+        my $data2 = $sth->fetchrow_hashref;
+        if ($data2) {
+            $$data{required} = 1;
+        } else {
+            $$data{required} = 0;
+        }
+        $sth->finish;
+        Mig::db_disconnect($dbh);
+    }
+    return $data;
+}
+
+# Store a free-text migration note on the column's tracked_column row.
+# A missing/empty comment is a no-op.
+sub set_comment {
+    my ($column, $comment) = @_;
+    return unless $comment;
+
+    my $data = status_this_column($column);
+    my $dbh = Mig::db_connect();
+    $dbh->do("
+        UPDATE $MIGSCHEMA.tracked_column
+        SET comment = " . $dbh->quote($comment) . "
+        WHERE id = " . $dbh->quote($data->{id}) . ";
+    ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n";
+    Mig::db_disconnect($dbh);
+}
+
+# Store the SQL transform expression on the column's tracked_column row.
+# A missing/empty transform is a no-op.
+sub set_transform {
+    my ($column, $transform) = @_;
+    return unless $transform;
+
+    my $data = status_this_column($column);
+    my $dbh = Mig::db_connect();
+    $dbh->do("
+        UPDATE $MIGSCHEMA.tracked_column
+        SET transform = " . $dbh->quote($transform) . "
+        WHERE id = " . $dbh->quote($data->{id}) . ";
+    ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n";
+    Mig::db_disconnect($dbh);
+}
+
+# Record the mapping target for a column.  A bare column name targets the
+# staging table itself; "table.column" targets another table (the split
+# happens at the last dot).  A missing/empty target is a no-op.
+sub set_target {
+    my ($column, $target) = @_;
+    return unless $target;
+
+    my ($target_table, $target_column) =
+        $target =~ /^(.+)\.(.+)$/ ? ($1, $2) : ($table, $target);
+
+    my $data = status_this_column($column);
+    my $dbh = Mig::db_connect();
+    $dbh->do("
+        UPDATE $MIGSCHEMA.tracked_column
+        SET target_table = " . $dbh->quote($target_table) . "
+        ,target_column = " . $dbh->quote($target_column) . "
+        WHERE id = " . $dbh->quote($data->{id}) . ";
+    ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n";
+    Mig::db_disconnect($dbh);
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-quicksheet
+
+Quickly produces a simple Excel spreadsheet based on the tracked file suitable
+for simple end-user mapping. The new file is named after the tracked file, but
+ends in .mapping.xls
+
+=head1 SYNOPSIS
+
+B<mig-quicksheet> <file>
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Spreadsheet::WriteExcel;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+# Show full usage when no file argument was given or help was requested.
+pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+my $has_x_source = 0; # set later if an x_source column is found
+my $file = abs_path($ARGV[0]);
+my $fdata;
+my $tracked_file_id = Mig::check_for_tracked_file($file);
+if ($tracked_file_id) {
+    $fdata = Mig::status_this_file($file);
+} else {
+    die "File not currently tracked: $file\n";
+}
+
+# The spreadsheet is built from the staging table recorded for this file.
+my $table = $fdata->{staged_table};
+if (!$table) {
+    # Fixed duplicated word in the error message ("staged staged").
+    die "No staged table for file: $file\n";
+}
+
+# Create the .mapping.xls workbook and the "Field Summary" sheet with its
+# header rows, then fill it via handle_columns().
+my $workbook = Spreadsheet::WriteExcel->new($file . '.mapping.xls');
+my @worksheets = ();
+print "Writing $file.mapping.xls\n";
+print "Sheet: Field Summary\n";
+my $first_sheet = $workbook->add_worksheet('Field Summary');
+# Shared cell formats: bold left-aligned for headers, plain left for data.
+my $bold = $workbook->add_format();
+$bold->set_bold();
+$bold->set_align('left');
+my $left = $workbook->add_format();
+$left->set_align('left');
+$first_sheet->set_column(0,6,30);
+
+# Total row count of the staging table, shown in the sheet header.
+# NOTE(review): the die message mentions information_schema but this
+# query runs against the staging table.
+my $dbh = Mig::db_connect();
+my $sth = $dbh->prepare("
+    SELECT COUNT(*)
+    FROM $MIGSCHEMA.$table
+    LIMIT 1;
+");
+my $rv = $sth->execute()
+    || die "Error retrieving data from information_schema: $!";
+
+my @cols = $sth->fetchrow_array;
+$sth->finish;
+my $count = $cols[0];
+Mig::db_disconnect($dbh);
+
+$first_sheet->write(0,0,'Source File:',$bold);
+$first_sheet->write(0,1,$file,$left);
+$first_sheet->write(1,0,'Number of Rows:',$bold);
+$first_sheet->write(1,1,$count,$left);
+
+# Column-summary table starts a few rows down, after the file header.
+my $sheet_row_start = 4;
+
+$first_sheet->write($sheet_row_start,0,'Legacy Column',$bold);
+$first_sheet->write($sheet_row_start,1,'Non-Empty Rows',$bold);
+$first_sheet->write($sheet_row_start,2,'Distinct Non-NULL Values',$bold);
+$first_sheet->write($sheet_row_start,3,'Min Value',$bold);
+$first_sheet->write($sheet_row_start,4,'Min Length',$bold);
+$first_sheet->write($sheet_row_start,5,'Max Value',$bold);
+$first_sheet->write($sheet_row_start,6,'Max Length',$bold);
+
+handle_columns();
+
+$workbook->close();
+
+exit 0;
+###############################################################################
+
+# Walk the staging table's columns in DDL order (via information_schema) and
+# write one summary row per data column (l_* legacy columns, and x_* extras
+# other than the bookkeeping columns x_migrate/x_source/x_egid/x_hseq) to the
+# "Field Summary" sheet.  Columns with 2..500 distinct values also get their
+# own breakdown sheet via group_by().  Uses file globals: $MIGSCHEMA, $table,
+# $first_sheet, $sheet_row_start, $left, and sets $has_x_source.
+sub handle_columns {
+ my $dbh = Mig::db_connect();
+ my $sth = $dbh->prepare("
+ SELECT *
+ FROM information_schema.columns
+ WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . "
+ AND table_name = " . $dbh->quote($table) . "
+ ORDER BY dtd_identifier::INTEGER ASC;
+ ");
+ my $rv = $sth->execute()
+ || die "Error retrieving data from information_schema: $!";
+
+ my $sheet_row_offset = 0;
+
+ while (my $data = $sth->fetchrow_hashref) {
+ my $column = $data->{column_name};
+ # Remember whether the table carries a source marker; group_by() adds a
+ # Source column to its breakdown sheets when it does.
+ if ($column eq 'x_source') {
+ $has_x_source = 1;
+ }
+ if ($column =~ /^l_/
+ || ($column =~ /^x_/
+ && ( $column ne 'x_migrate'
+ && $column ne 'x_source'
+ && $column ne 'x_egid'
+ && $column ne 'x_hseq'
+ )
+ )
+ ) {
+ $sheet_row_offset++;
+ my $cdata = column_summary($column);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,0,$column,$left);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,1,$cdata->{non_empty_count},$left);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,2,$cdata->{distinct_value_count},$left);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,3,$cdata->{min_value},$left);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,4,$cdata->{min_length},$left);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,5,$cdata->{max_value},$left);
+ $first_sheet->write($sheet_row_start + $sheet_row_offset,6,$cdata->{max_length},$left);
+ # A per-value breakdown is only useful for low-cardinality columns.
+ if ($cdata->{distinct_value_count} > 1 && $cdata->{distinct_value_count} <= 500) {
+ group_by($column);
+ }
+ }
+ }
+ $sth->finish;
+ Mig::db_disconnect($dbh);
+}
+
+# Collect summary statistics for one column of the staging table.
+#
+# Returns a hashref with keys: non_empty_count, distinct_value_count,
+# min_value, min_length, max_value, max_length.  NULL aggregate results
+# (e.g. MIN() over an all-NULL column) are rendered as the literal string
+# '<NULL>' so they display sensibly in the spreadsheet.
+#
+# NOTE(review): $column and $table are interpolated directly into the SQL;
+# both come from information_schema / the tracking table rather than user
+# input, matching the rest of this script.
+sub column_summary {
+    my $column = shift;
+
+    my $dbh = Mig::db_connect();
+
+    # The six statistics differ only in their SQL, so run them through one
+    # single-scalar query helper instead of six copy-pasted blocks.
+    my $fetch_scalar = sub {
+        my $sql = shift;
+        my $sth = $dbh->prepare($sql);
+        $sth->execute()
+            || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+        my @cols = $sth->fetchrow_array;
+        $sth->finish;
+        return $cols[0];
+    };
+
+    my $non_empty_count = $fetch_scalar->("
+        SELECT COUNT(*)
+        FROM $MIGSCHEMA.$table
+        WHERE $column IS NOT NULL AND BTRIM($column) <> '';
+    ");
+    my $distinct_value_count = $fetch_scalar->("
+        SELECT COUNT(DISTINCT $column)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_value = $fetch_scalar->("
+        SELECT MIN($column)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $min_length = $fetch_scalar->("
+        SELECT MIN(LENGTH($column))
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_value = $fetch_scalar->("
+        SELECT MAX($column)
+        FROM $MIGSCHEMA.$table;
+    ");
+    my $max_length = $fetch_scalar->("
+        SELECT MAX(LENGTH($column))
+        FROM $MIGSCHEMA.$table;
+    ");
+
+    return {
+        non_empty_count => $non_empty_count
+        ,distinct_value_count => $distinct_value_count
+        ,min_value => defined $min_value ? $min_value : '<NULL>'
+        ,min_length => defined $min_length ? $min_length : '<NULL>'
+        ,max_value => defined $max_value ? $max_value : '<NULL>'
+        ,max_length => defined $max_length ? $max_length : '<NULL>'
+    };
+}
+
+# Write a per-value breakdown sheet for one column: a count of rows per
+# distinct value (and per x_source, when the staging table has one).  The
+# sheet is named after the column, truncated to Excel's 31-character limit.
+sub group_by {
+ my ($column,$option) = (shift,"GROUP BY 2 ORDER BY 2");
+
+ print "Sheet: $column\n";
+ my $col_sheet = $workbook->add_worksheet(substr($column,0,31));
+ push @worksheets, $col_sheet;
+ $col_sheet->set_column(0,6,30);
+
+ my $col_sheet_row_start = 0;
+ my $col_sheet_row_offset = 0;
+
+ $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,0,'Count',$bold);
+ if ($has_x_source) {
+ # With a source column present, group and order by both source and value.
+ $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,1,'Source',$bold);
+ $option = "GROUP BY 2,3 ORDER BY 2,3";
+ }
+ $col_sheet->write(
+ $col_sheet_row_start + $col_sheet_row_offset
+ ,$has_x_source ? 2 : 1
+ ,"Legacy Value for $column"
+ ,$bold
+ );
+
+ # NOTE(review): this handle is never passed to Mig::db_disconnect(), unlike
+ # the other helpers in this script — confirm whether that is intentional.
+ my $dbh = Mig::db_connect();
+ my $sth;
+ my $rv;
+
+ $sth = $dbh->prepare("
+ SELECT COUNT(*), " . ($has_x_source ? 'x_source, ' : '') . "$column
+ FROM $MIGSCHEMA.$table
+ $option;
+ ");
+ $rv = $sth->execute()
+ || die "Error retrieving data from $MIGSCHEMA.$table: $!";
+
+ while (my @cols = $sth->fetchrow_array) {
+ $col_sheet_row_offset++;
+ my $count = $cols[0];
+ $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,0,$count,$left);
+ my $value;
+ if ($has_x_source) {
+ my $source = defined $cols[1] ? $cols[1] : '<NULL>';
+ $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,1,$source,$left);
+ $value = defined $cols[2] ? $cols[2] : '<NULL>';
+ $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,2,$value,$left);
+ } else {
+ $value = defined $cols[1] ? $cols[1] : '<NULL>';
+ $col_sheet->write($col_sheet_row_start + $col_sheet_row_offset,1,$value,$left);
+ }
+ }
+ $sth->finish;
+}
+
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-remove - This will remove the specified files from the mig tracking table
+for the schema pointed to by the MIGSCHEMA environment variable in the
+PostgreSQL database specified by various PG environment variables. <inhales,
+exhales, phew>
+
+You'll need to invoke B<mig-init> prior to using commands like B<mig-remove>
+
+=head1 SYNOPSIS
+
+B<mig-remove> <file> [file] [...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+# Untrack each named file, refusing anything outside the migration work area.
+foreach my $arg (@ARGV) {
+ my $file = abs_path($arg);
+ if ($file =~ /^$MIGBASEWORKDIR/) {
+ remove_this_file($file);
+ } else {
+ print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+ }
+}
+
+exit 0;
+
+###############################################################################
+
+# Delete the tracking row for $file from $MIGSCHEMA.tracked_file, if the
+# file is tracked; otherwise just report that it is untracked.
+sub remove_this_file {
+    my $file = shift;
+    my $tracked_file_id = Mig::check_for_tracked_file($file,{'allow_missing'=>1});
+    if ($tracked_file_id) {
+        print "removing tracked file: $file\n";
+        my $dbh = Mig::db_connect();
+        # Bind the id rather than interpolating it into the SQL, and report
+        # DBI's own error string ($! is not set on DBI failures).
+        my $rv = $dbh->do(
+            "DELETE FROM $MIGSCHEMA.tracked_file WHERE id = ?;"
+            ,undef
+            ,$tracked_file_id
+        ) || die "Error deleting from table $MIGSCHEMA.tracked_file (id = $tracked_file_id): " . $dbh->errstr . "\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        print "File not currently tracked: $file\n";
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-skip-clean
+
+Allows you to use either an existing file named <file>.utf8.clean or a
+named [cleaned file] as if it were the one created by mig-clean.
+
+Note that the clean file, however specified, should contain headers. The
+remaining tools in the chain will expect this.
+
+=head1 SYNOPSIS
+
+B<mig-skip-clean> <file> [cleaned file]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if ! ($ARGV[0]||$ARGV[1]) || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+my $file = abs_path($ARGV[0]);
+my $clean_file;
+if ($ARGV[1]) {
+    $clean_file = abs_path($ARGV[1]);
+}
+# BUGFIX: the original test was "! $clean_file =~ /.../".  ! binds more
+# tightly than =~, so the regex was matched against the negated boolean
+# ('' or 1) and the guard could never fire.  Use !~ instead.
+if ($clean_file && $clean_file !~ /^$MIGBASEWORKDIR/) {
+    die "File falls outside of MIGWORKDIR ($MIGWORKDIR): $clean_file\n";
+}
+
+if ($file =~ /^$MIGBASEWORKDIR/) {
+    skip_clean($file,$clean_file);
+} else {
+    die "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+}
+
+exit 0;
+
+###############################################################################
+
+# Record an existing "clean" file for a tracked file without running
+# mig-clean.  Requires the file to be tracked and to have a UTF-8 version
+# registered (mig-iconv / mig-skip-iconv).  If no [cleaned file] argument
+# was given, defaults to <utf8 file>.clean; the chosen file must exist.
+sub skip_clean {
+    my $file = shift;
+    my $clean_file = shift;
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if ($tracked_file_id) {
+        my $data = Mig::status_this_file($file);
+
+        if (! $data->{'utf8_filename'}) {
+            die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
+        }
+
+        my $utf8_file = $data->{'utf8_filename'};
+        if (! -e $utf8_file) {
+            die "missing file: $utf8_file\n";
+        }
+
+        print "skipping cleaning of tracked file: $file\n";
+
+        my $dbh = Mig::db_connect();
+        if (! $clean_file) {
+            $clean_file = $utf8_file . '.clean';
+        }
+        if (! -e $clean_file) {
+            die "clean file does not exist: $clean_file\n";
+        }
+
+        # BUGFIX: the failure message previously said "inserting" although
+        # this is an UPDATE; also report DBI's errstr instead of $!.
+        my $rv = $dbh->do("
+            UPDATE $MIGSCHEMA.tracked_file
+            SET clean_filename = " . $dbh->quote($clean_file) . "
+            WHERE base_filename = " . $dbh->quote($file) . "
+            ;
+        ") || die "Error updating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        die "File not currently tracked: $file\n";
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-skip-iconv
+
+Allows you to use either an existing file named <file>.utf8 or a named
+[utf8 file] as if it were the one created by mig-iconv.
+
+=head1 SYNOPSIS
+
+B<mig-skip-iconv> <file> [utf8 file]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if ! ($ARGV[0]||$ARGV[1]) || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+my $file = abs_path($ARGV[0]);
+my $utf8_file;
+if ($ARGV[1]) {
+    $utf8_file = abs_path($ARGV[1]);
+}
+# BUGFIX: the original test was "! $utf8_file =~ /.../".  ! binds more
+# tightly than =~, so the regex was matched against the negated boolean
+# ('' or 1) and the guard could never fire.  Use !~ instead.
+if ($utf8_file && $utf8_file !~ /^$MIGBASEWORKDIR/) {
+    die "File falls outside of MIGWORKDIR ($MIGWORKDIR): $utf8_file\n";
+}
+
+if ($file =~ /^$MIGBASEWORKDIR/) {
+    skip_iconv($file,$utf8_file);
+} else {
+    print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+}
+
+exit 0;
+
+###############################################################################
+
+# Record an existing UTF-8 file for a tracked file without running
+# mig-iconv.  If no [utf8 file] argument was given, defaults to
+# <file>.utf8; the chosen file must exist on disk.
+sub skip_iconv {
+    my $file = shift;
+    my $utf8_file = shift;
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if ($tracked_file_id) {
+        # (Removed an unused "my $data = Mig::status_this_file($file);"
+        # whose result was never read.)
+        print "skipping the iconv'ing of tracked file: $file\n";
+
+        my $dbh = Mig::db_connect();
+        if (! $utf8_file) {
+            $utf8_file = $file . '.utf8';
+        }
+        if (! -e $utf8_file) {
+            die "utf8 file does not exist: $utf8_file\n";
+        }
+
+        # BUGFIX: the failure message previously said "inserting" although
+        # this is an UPDATE; also report DBI's errstr instead of $!.
+        my $rv = $dbh->do("
+            UPDATE $MIGSCHEMA.tracked_file
+            SET utf8_filename = " . $dbh->quote($utf8_file) . "
+            WHERE base_filename = " . $dbh->quote($file) . "
+            ;
+        ") || die "Error updating table $MIGSCHEMA.tracked_file: " . $dbh->errstr . "\n";
+        Mig::db_disconnect($dbh);
+    } else {
+        print "File not currently tracked: $file\n";
+    }
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-stage
+
+Load the SQL-converted version of the specified file into the migration schema.
+
+Extra arguments are passed to the underlying call to psql
+
+If the tracked file was previously staged with a different table, drop that
+table.
+
+
+=head1 SYNOPSIS
+
+B<mig-stage> <file> [other arguments...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+# Stage the named file, refusing anything outside the migration work area.
+# All of @ARGV is forwarded: stage_csv() shifts off the file and passes the
+# remaining arguments straight through to psql.
+my $file = abs_path($ARGV[0]);
+if ($file =~ /^$MIGBASEWORKDIR/) {
+ stage_csv(@ARGV);
+} else {
+ print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+}
+
+exit 0;
+
+###############################################################################
+
+# Run the tracked file's .stage.sql through psql and record the resulting
+# staging table in the tracking table.  Requires the iconv, clean, and
+# convert steps to have been recorded first.  If the file was previously
+# staged under a different table name, the old table is dropped and
+# references in tracked_column are repointed before the new load.
+sub stage_csv {
+ my $file = abs_path(shift);
+ my @args = @_;
+
+ my $tracked_file_id = Mig::check_for_tracked_file($file);
+ if ($tracked_file_id) {
+ my $data = Mig::status_this_file($file);
+
+ if (! $data->{'utf8_filename'}) {
+ die "mig-iconv or mig-skip-iconv needed for UTF8 version of file: $file\n";
+ }
+
+ if (! $data->{'clean_filename'}) {
+ die "mig-clean or mig-skip-clean needed for .clean version of file: $file\n";
+ }
+
+ if (! $data->{'stage_sql_filename'}) {
+ die "mig-convert needed for .stage.sql version of file: $file\n";
+ }
+
+ my $stage_sql_filename = $data->{'stage_sql_filename'};
+ if (! -e $stage_sql_filename) {
+ die "missing file: $stage_sql_filename\n";
+ }
+
+ # Scrape the target schema.table out of the first CREATE UNLOGGED TABLE
+ # statement in the staging SQL (4th space-separated token).
+ my $schema_table = `grep 'CREATE UNLOGGED TABLE' $stage_sql_filename | cut -f4 -d\\ | head -1`;
+ chomp $schema_table;
+ my ($schema,$table) = split /\./, $schema_table;
+
+ # Re-staging under a new table name: drop the old table and repoint any
+ # tracked_column rows that referenced it.
+ if (defined $data->{'staged_table'} && $data->{'staged_table'} ne $table) {
+ my $dbh2 = Mig::db_connect();
+ print "dropping previously staged table: $MIGSCHEMA.$data->{staged_table}\n";
+ my $rv2 = $dbh2->do("
+ DROP TABLE $MIGSCHEMA.$data->{staged_table};
+ ") || die "Error dropping table $data->{staged_table}: $!\n";
+ print "changing references to old tables\n";
+ my $rv3 = $dbh2->do("
+ UPDATE $MIGSCHEMA.tracked_column
+ SET staged_table = " . $dbh2->quote($table) . "
+ WHERE staged_table = " . $dbh2->quote($data->{staged_table}) . "
+ ") || die "Error changing references to $data->{staged_table}: $!\n";
+ my $rv4 = $dbh2->do("
+ UPDATE $MIGSCHEMA.tracked_column
+ SET target_table = " . $dbh2->quote($table) . "
+ WHERE target_table = " . $dbh2->quote($data->{staged_table}) . "
+ ") || die "Error changing references to $data->{staged_table}: $!\n";
+ Mig::db_disconnect($dbh2);
+ }
+
+ print "running staging SQL: $stage_sql_filename\n";
+
+ # NOTE(review): psql's exit status is not checked here; failures are only
+ # caught indirectly by the table-existence check below.
+ system('psql', @args, '-f', $stage_sql_filename);
+
+ # NOTE(review): the schema-mismatch check runs after the SQL has already
+ # been executed — confirm whether it should happen before the load.
+ if ($schema ne $MIGSCHEMA) {
+ die "Schema mismatch: env => $MIGSCHEMA sql => $schema\n";
+ }
+ if (! Mig::check_db_migschema_for_specific_table($table)) {
+ die "Missing staged table: $schema_table\n";
+ } else {
+ print "table staged: $schema_table\n";
+ }
+
+ my $dbh = Mig::db_connect();
+ my $rv = $dbh->do("
+ UPDATE $MIGSCHEMA.tracked_file
+ SET staged_table = " . $dbh->quote($table) . "
+ WHERE base_filename = " . $dbh->quote($file) . "
+ ;
+ ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
+ Mig::db_disconnect($dbh);
+ } else {
+ print "File not currently tracked: $file\n";
+ }
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-status - This will show tracking information for either the specified files
+or all tracked files if no argument is given.
+
+You'll need to invoke B<mig-init> prior to using commands like B<mig-status>
+
+=head1 SYNOPSIS
+
+B<mig-status> [file] [...]
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if scalar(@ARGV) > 0 && $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+# With no arguments, report on every tracked file in the schema.
+my @files = @ARGV;
+if (scalar(@files) == 0) {
+ @files = ();
+ my $dbh = Mig::db_connect();
+ my $sth = $dbh->prepare("
+ SELECT base_filename
+ FROM $MIGSCHEMA.tracked_file
+ ORDER BY 1;"
+ );
+ my $rv = $sth->execute()
+ || die "Error retrieving data from table (tracked_file): $!";
+ my $rows = $sth->fetchall_arrayref;
+ for my $row ( @$rows ) {
+ push @files, $row->[0]
+ }
+ $sth->finish;
+ Mig::db_disconnect($dbh);
+}
+
+# Print each file's tracking record, flagging recorded filenames that no
+# longer exist on disk.
+foreach my $arg (sort @files) {
+ my $file = abs_path($arg);
+ my $data = Mig::status_this_file($file);
+ print "=-=-=\n";
+ foreach my $key (
+ 'base_filename'
+ ,'has_headers'
+ ,'headers_file'
+ ,'utf8_filename'
+ ,'clean_filename'
+ ,'parent_table'
+ ,'stage_sql_filename'
+ ,'staged_table'
+ ,'map_sql_filename'
+ ,'prod_sql_filename'
+ ) {
+ printf "%-20s:\t", $key;
+ print $data->{$key} ? $data->{$key} : "";
+ if ($key =~ /filename$/ && $data->{$key} && ! -e $data->{$key}) {
+ print " (FILE MISSING)";
+ }
+ print "\n";
+ }
+}
+
+exit 0;
+
+###############################################################################
+
+
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+mig-unlink
+
+Clear any association between the specified file and a parent table within the
+migration schema.
+
+=head1 SYNOPSIS
+
+B<mig-unlink> <file>
+
+=cut
+
+###############################################################################
+
+use strict;
+use Switch;
+use Env qw(
+ HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
+ MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
+);
+use Pod::Usage;
+use DBI;
+use Cwd 'abs_path';
+use FindBin;
+my $mig_bin = "$FindBin::Bin/";
+use lib "$FindBin::Bin/";
+use Mig;
+
+pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';
+
+Mig::die_if_no_env_migschema();
+Mig::die_if_mig_tracking_table_does_not_exist();
+
+# Unlink the named file's parent table, refusing anything outside the
+# migration work area.
+my $file = abs_path($ARGV[0]);
+if ($file =~ /^$MIGBASEWORKDIR/) {
+ unlink_table(@ARGV);
+} else {
+ print "File falls outside of MIGWORKDIR ($MIGWORKDIR): $file\n";
+}
+
+exit 0;
+
+###############################################################################
+
+# Clear the parent_table association recorded for a tracked file; untracked
+# files are only reported, not modified.
+sub unlink_table {
+    my $file = abs_path(shift);
+
+    my $tracked_file_id = Mig::check_for_tracked_file($file);
+    if (!$tracked_file_id) {
+        print "File not currently tracked: $file\n";
+        return;
+    }
+
+    my $data = Mig::status_this_file($file);
+    my $table = $data->{'parent_table'} || '';
+    print "unlinking table ($table) from file: $file\n";
+
+    my $dbh = Mig::db_connect();
+    $dbh->do("
+        UPDATE $MIGSCHEMA.tracked_file
+        SET parent_table = ''
+        WHERE base_filename = " . $dbh->quote($file) . "
+        ;
+    ") || die "Error updating table $MIGSCHEMA.tracked_file: $!\n";
+    Mig::db_disconnect($dbh);
+}
--- /dev/null
+#!/usr/bin/perl -w
+use Storable;
+use Switch;
+use Getopt::Long;
+use Text::CSV_XS;
+use Text::CSV::Separator qw(get_separator);
+use Data::Dumper;
+use Term::ANSIColor;
+
+# may be manipulated with --config
+our %CSV_options = (
+ binary => 1,
+ auto_diag => 1,
+ diag_verbose => 1,
+);
+my $csv;
+our $fixes = {
+ 'R' => [],
+ 'I' => [],
+ 'D' => []
+};
+my @parsed_rows;
+my @lines_with_errors = ();
+my %line_numbers_for_lines_with_errors = ();
+my $expected_column_count;
+my $line_no;
+
+# GetOpt variables
+my $config;
+my $id_cols;
+my $fix;
+my $nosave;
+my $apply;
+my $pad;
+my $truncate;
+my $backslash;
+my $debug;
+my $help;
+my $create_headers;
+my $headers_file;
+my @headers = ();
+
+my $pad_count = 0; my $trunc_count = 0; my $fix_count = 0; my $backslash_count = 0;
+
+################################################################## Subs
+
+# Render a raw CSV line for terminal display: the separator, quote, and
+# escape characters are replaced with colorized <char-or-ordinal> markers,
+# and all other control characters (ordinals 0-31) with yellow <ordinal>
+# markers.  Returns the decorated line with a trailing newline.
+sub format_for_display {
+ my $formatted_line = shift;
+ # NOTE(review): '\t' is single-quoted — a two-character backslash-t string,
+ # not a tab.  As a regex it still matches a tab below, but ord() sees '\'
+ # (92), so the marker displays as <\t> rather than <9>.  Confirm intent.
+ my $sep_char = $CSV_options{sep_char} || '\t';
+ my $sep = color 'bold blue';
+ $sep .= '<' . (ord($sep_char) < 32 ? ord($sep_char) : $sep_char) . '>';
+ $sep .= color 'reset';
+ # NOTE(review): when quote_char/escape_char are unset these default to '',
+ # and the s/$quote_char/.../ below becomes an empty pattern, which Perl
+ # treats as "reuse the last successful match" — likely not intended.
+ my $quote_char = $CSV_options{quote_char} || '';
+ my $quote = color 'bold red';
+ $quote .= '<' . (ord($quote_char) < 32 ? ord($quote_char) : $quote_char) . '>';
+ $quote .= color 'reset';
+ my $escape_char = $CSV_options{escape_char} || '';
+ my $escape = color 'bold green';
+ $escape .= '<' . (ord($escape_char) < 32 ? ord($escape_char) : $escape_char) . '>';
+ $escape .= color 'reset';
+ # The ANSI escape character (27) gets its own marker so the generic
+ # control-character loop below can skip it.
+ my $real_escape_char = chr(27);
+ my $real_escape = color 'bold green';
+ $real_escape .= '<27>';
+ $real_escape .= color 'reset';
+
+ $formatted_line =~ s/$real_escape_char/$real_escape/g;
+ $formatted_line =~ s/$sep_char/$sep/g;
+ $formatted_line =~ s/$quote_char/$quote/g;
+ $formatted_line =~ s/$escape_char/$escape/g;
+ for (my $i = 0; $i < 32; $i++) {
+ if ($i == 27) { next; }
+ my $other_char = chr($i);
+ my $other = color 'yellow';
+ $other .= "<$i>";
+ $other .= color 'reset';
+ $formatted_line =~ s/$other_char/$other/g;
+ }
+ return "$formatted_line\n";
+}
+
+# Re-assemble a parsed row into a single CSV line (newline-terminated) using
+# the shared $csv object, dying with the offending input when Text::CSV_XS
+# cannot combine the fields.
+sub combine_cols {
+    my $fields = shift;
+    my $ok = $csv->combine(@$fields);
+    unless ($ok && $csv->string) {
+        die $csv->error_input . "\n";
+    }
+    return $csv->string . "\n";
+}
+
+# Replace backslashes in a line with forward slashes, preserving the
+# PostgreSQL NULL marker '\N' when the line parses as CSV.  Announces any
+# change and adds the number of converted characters to $backslash_count.
+sub convert_backslashes {
+    my $line = shift;
+    my @count = $line =~ /\\/g;
+    if (scalar(@count) > 0) {
+        my $altered_line;
+        my $csv2 = Text::CSV_XS->new(\%CSV_options);
+        if ($csv2->parse($line)) {
+            # Convert field-by-field so '\N' (NULL) cells are left intact.
+            my @columns = $csv2->fields();
+            foreach my $c (@columns) {
+                if ($c ne '\N') {
+                    $c =~ s/\\/\//g;
+                }
+            }
+            $altered_line = combine_cols(\@columns);
+        } else {
+            # BUGFIX: the original ran the substitution on the still-undef
+            # $altered_line; copy the raw line first, then convert.
+            ($altered_line = $line) =~ s/\\/\//g;
+        }
+        if ($line ne $altered_line) {
+            $backslash_count += scalar(@count);
+            print "\nline#$line_no>> Converting " . scalar(@count) . " backslashes to forward slashes\n";
+            print "before: " . format_for_display($line);
+            $line = $altered_line;
+            print " after: " . format_for_display($line);
+        }
+    }
+    return $line;
+}
+
+# Apply every saved regex ('R') fix whose identifying pattern matches this
+# line, announcing each application and bumping the global $fix_count.
+sub apply_line_fixes {
+    my $line = shift;
+    for my $saved (@{ $fixes->{'R'} }) {
+        my ($id_regex, $regex1, $regex2, $global, $ignore_case) = @$saved;
+        next unless $line =~ /$id_regex/;
+        print "\nline#$line_no>> Applying regex fix for $id_regex\n";
+        $fix_count++;
+        print "before: " . format_for_display($line);
+        $line = fix_via_regex($id_regex,$line,$regex1,$regex2,$global,$ignore_case);
+        print " after: " . format_for_display($line);
+    }
+    return $line;
+}
+
+# Apply every saved insert ('I') fix whose identifying pattern matches the
+# line and whose recorded column count matches the parsed column count.
+sub apply_insert_fixes {
+    my $line = shift;
+    my $cols = shift;
+    for my $saved (@{ $fixes->{'I'} }) {
+        my ($id_regex, $col_count_check, $fix_cols, $fix_value) = @$saved;
+        next unless $line =~ /$id_regex/ && scalar(@$cols) eq $col_count_check;
+        print "\nline#$line_no>> Applying insert for $id_regex\n";
+        $fix_count++;
+        print "before: " . format_for_display($line);
+        $line = fix_via_insert($id_regex,$cols,$col_count_check,$fix_cols,$fix_value);
+        print " after: " . format_for_display($line);
+    }
+    return $line;
+}
+
+# Apply every saved delete ('D') fix whose identifying pattern matches the
+# line and whose recorded column count matches the parsed column count.
+sub apply_delete_fixes {
+    my $line = shift;
+    my $cols = shift;
+    for my $saved (@{ $fixes->{'D'} }) {
+        my ($id_regex, $col_count_check, $fix_cols) = @$saved;
+        next unless $line =~ /$id_regex/ && scalar(@$cols) eq $col_count_check;
+        print "\nline#$line_no>> Applying delete for $id_regex\n";
+        $fix_count++;
+        print "before: " . format_for_display($line);
+        $line = fix_via_delete($id_regex,$cols,$col_count_check,$fix_cols);
+        print " after: " . format_for_display($line);
+    }
+    return $line;
+}
+
+# Apply every saved join ('J') fix whose identifying pattern matches the
+# line and whose recorded column count matches the parsed column count.
+sub apply_join_fixes {
+    my $line = shift;
+    my $cols = shift;
+    for my $saved (@{ $fixes->{'J'} }) {
+        my ($id_regex, $col_count_check, $fix_cols) = @$saved;
+        next unless $line =~ /$id_regex/ && scalar(@$cols) eq $col_count_check;
+        print "\nline#$line_no>> Applying join for $id_regex\n";
+        $fix_count++;
+        print "before: " . format_for_display($line);
+        $line = fix_via_join($id_regex,$cols,$col_count_check,$fix_cols);
+        print " after: " . format_for_display($line);
+    }
+    return $line;
+}
+
+# Record a fix under its type key ('R','I','D','J') and persist the whole
+# fix set to <input-file>.fixes via Storable — unless --nosave was given.
+sub save_fix {
+    my ($type, $fix) = @_;
+    print "saving fix...";
+    if ($nosave) {
+        print "psyche!\n";
+        return;
+    }
+    print "fix = " . Dumper($fix) . "\n" if $debug;
+    push @{ $fixes->{$type} }, $fix;
+    store($fixes, $ARGV[0] . '.fixes');
+    print "saved\n";
+}
+
+# Build a default identifying regex from the values of the --idcols columns
+# of the current row (column 0 when --idcols was not given).  '.+' separates
+# successive id-column values and '.?' pads each side to tolerate quoting.
+sub id_cols_regex {
+    my $cols = shift;
+    my @f = sort(split /,/, $id_cols || '0');
+    my $regex = '';
+    for (my $i = 0; $i < scalar(@f); $i++) {
+        if ($i > 0) {
+            $regex .= '.+'; # characters between id columns
+        }
+        # BUGFIX: quote the column value — raw data containing regex
+        # metacharacters (., *, (, [, ...) used to yield a broken or
+        # over-matching default pattern.
+        $regex .= '.?' . quotemeta($cols->[$f[$i]]) . '.?';
+    }
+    return $regex;
+}
+
+# Apply an s/match/replace/ fix to the line.  When called without
+# $fix_regex1 (interactive path), prompts for the global and ignore-case
+# flags and both regexes, verifies the match regex against the current
+# line, and saves the completed fix for reuse on later lines/runs.
+sub fix_via_regex {
+    my $id_regex = shift;
+    my $line = shift;
+    my $fix_regex1 = shift;
+    my $fix_regex2 = shift;
+    my $global = shift;
+    my $ignore_case = shift;
+    my $save;
+
+    if (!$fix_regex1) {
+global_prompt:
+        print "Global (aka s/match/replace/g)? <Yes/No> [n] ";
+        $global = readline(STDIN); chomp $global;
+        $global = uc(substr($global,0,1));
+        if ($global eq '') {
+            $global = 'N';
+        }
+        $global = uc(substr($global,0,1));
+        if ($global ne 'Y' && $global ne 'N') {
+            goto global_prompt;
+        }
+case_prompt:
+        print "Ignore-case (aka s/match/replace/i)? <Yes/No> [n] ";
+        $ignore_case = readline(STDIN); chomp $ignore_case;
+        $ignore_case = uc(substr($ignore_case,0,1));
+        if ($ignore_case eq '') {
+            $ignore_case = 'N';
+        }
+        $ignore_case = uc(substr($ignore_case,0,1));
+        if ($ignore_case ne 'Y' && $ignore_case ne 'N') {
+            goto case_prompt;
+        }
+
+regex1_prompt:
+        print "Enter match regex for s/match/replace/: ";
+        $fix_regex1 = readline(STDIN); chomp $fix_regex1;
+        if ($fix_regex1 eq '') {
+            goto global_prompt;
+        }
+        # BUGFIX: the original covered only the Y/Y, Y/N, and N/N flag
+        # combinations — and tested N/N with /i — so a case-sensitive check
+        # never happened and N/Y always reported "does not match".  /g is
+        # irrelevant to an existence check (and in scalar context would
+        # advance pos()), so only /i matters here.
+        if (
+            ($ignore_case eq 'Y' && $line =~ /$fix_regex1/i)
+            || ($ignore_case eq 'N' && $line =~ /$fix_regex1/)
+        ) {
+            print "Regex matches line.\n";
+        } else {
+            print "Regex does not match line.\n";
+            goto regex1_prompt;
+        }
+regex2_prompt:
+        print "Enter replace regex for s/match/replace/: ";
+        $fix_regex2 = readline(STDIN); chomp $fix_regex2;
+        if (substr($fix_regex1,-1) eq '$') {
+            print "Adding new line to end of /$fix_regex2/ based on \$ in /$fix_regex1/\n";
+            $fix_regex2 .= "\n";
+        }
+        # TODO - how do we handle backreferences with this?
+        $save = 1;
+    }
+
+    switch ($global . $ignore_case) {
+        case 'YY' { $line =~ s/$fix_regex1/$fix_regex2/gi; }
+        case 'YN' { $line =~ s/$fix_regex1/$fix_regex2/g; }
+        case 'NY' { $line =~ s/$fix_regex1/$fix_regex2/i; }
+        case 'NN' { $line =~ s/$fix_regex1/$fix_regex2/; }
+    }
+
+    if ($save) {
+        save_fix('R',[
+            $id_regex,
+            $fix_regex1,
+            $fix_regex2,
+            $global,
+            $ignore_case
+        ]);
+    }
+
+    return $line;
+}
+
+# Insert $fix_value at each 0-based position in $fix_cols (prompting for
+# both when $fix_cols is not supplied), but only when the row still has
+# exactly $col_count_check columns.  Returns the recombined line, or undef
+# when the column-count guard skips the fix.
+sub fix_via_insert {
+    my $id_regex = shift;
+    my $cols = shift;
+    my $col_count_check = shift;
+    my $fix_cols = shift;
+    my $fix_value = shift;
+    my $line;
+    my $save;
+
+    if (!$fix_cols) {
+        $col_count_check = scalar(@{$cols});
+        print "This fix will only trigger when the number of columns is $col_count_check.\n";
+        print "Enter value to insert: [] ";
+        $fix_value = readline(STDIN); chomp $fix_value;
+        print "Enter comma-separated list of column positions (0-based) for insertion: ";
+        $fix_cols = readline(STDIN); chomp $fix_cols;
+        $save = 1;
+    }
+
+    if ($col_count_check != scalar(@{$cols})) {
+        print "WARNING: Insert column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
+        return $line;
+    }
+
+    # BUGFIX: sort numerically — the default string sort mis-orders
+    # positions once any index reaches double digits ("10" lt "2"), which
+    # breaks the "+ $i" splice-offset compensation below.
+    my @f = sort { $a <=> $b } (split /,/, $fix_cols);
+    for (my $i = 0; $i < scalar(@f); $i++) {
+        # Each earlier insertion shifts later positions right by one.
+        splice @{ $cols }, $f[$i] + $i, 0, $fix_value;
+    }
+
+    eval {
+        $line = combine_cols($cols);
+    };
+    if ($@) {
+        print "fix_via_insert error:\n";
+        die $@;
+    }
+
+    if ($save) {
+        save_fix('I',[
+            $id_regex,
+            $col_count_check,
+            $fix_cols,
+            $fix_value
+        ]);
+    }
+
+    return $line;
+}
+
+# Delete the columns at each 0-based position in $fix_cols (prompting when
+# $fix_cols is not supplied), but only when the row still has exactly
+# $col_count_check columns.  Returns the recombined line, or undef when the
+# column-count guard skips the fix.
+sub fix_via_delete {
+    my $id_regex = shift;
+    my $cols = shift;
+    my $col_count_check = shift;
+    my $fix_cols = shift;
+    my $line;
+    my $save;
+
+    if (!$fix_cols) {
+        $col_count_check = scalar(@{$cols});
+        print "This fix will only trigger when the number of columns is $col_count_check.\n";
+        print "Enter comma-separated list of column positions (0-based) to delete: ";
+        $fix_cols = readline(STDIN); chomp $fix_cols;
+        $save = 1;
+    }
+
+    if ($col_count_check != scalar(@{$cols})) {
+        print "WARNING: Delete column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
+        return $line;
+    }
+
+    # BUGFIX: sort numerically — the default string sort mis-orders
+    # positions once any index reaches double digits ("10" lt "2"), which
+    # breaks the "- $i" splice-offset compensation below.
+    my @f = sort { $a <=> $b } (split /,/, $fix_cols);
+    for (my $i = 0; $i < scalar(@f); $i++) {
+        # Each earlier deletion shifts later positions left by one.
+        splice @{ $cols }, $f[$i] - $i, 1;
+    }
+
+    eval {
+        $line = combine_cols($cols);
+    };
+    if ($@) {
+        print "fix_via_delete error:\n";
+        die $@;
+    }
+
+    if ($save) {
+        save_fix('D',[
+            $id_regex,
+            $col_count_check,
+            $fix_cols
+        ]);
+    }
+
+    return $line;
+}
+
+# Concatenate the columns at the 0-based positions in $fix_cols into a
+# single column at the lowest such position (prompting when $fix_cols is
+# not supplied), but only when the row still has exactly $col_count_check
+# columns.  Returns the recombined line, or undef when skipped.
+sub fix_via_join {
+ my $id_regex = shift;
+ my $cols = shift;
+ my $col_count_check = shift;
+ my $fix_cols = shift;
+ my $line;
+ my $save;
+
+ if (!$fix_cols) {
+ $col_count_check = scalar(@{$cols});
+ print "This fix will only trigger when the number of columns is $col_count_check.\n";
+ print "Enter comma-separated list of column positions (0-based) to join: ";
+ $fix_cols = readline(STDIN); chomp $fix_cols;
+ $save = 1;
+ }
+
+ if ($col_count_check != scalar(@{$cols})) {
+ print "WARNING: Join column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
+ return $line;
+ }
+
+ # gather
+ # Positions are sorted numerically (unlike fix_via_insert/fix_via_delete)
+ # so the "- $i" compensation below and the concatenation order are correct.
+ my $fix_value = '';
+ my @f = sort { $a <=> $b } (split /,/, $fix_cols);
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ $fix_value .= @{ $cols }[$f[$i]];
+ }
+
+ # delete
+ # Each earlier deletion shifts later positions left by one.
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ splice @{ $cols }, $f[$i] - $i, 1;
+ }
+
+ # insert
+ # The joined value lands at the lowest of the original positions.
+ splice @{ $cols }, $f[0], 0, $fix_value;
+
+ eval {
+ $line = combine_cols($cols);
+ };
+ if ($@) {
+ print "fix_via_join error:\n";
+ die $@;
+ }
+
+ if ($save) {
+ save_fix('J',[
+ $id_regex,
+ $col_count_check,
+ $fix_cols
+ ]);
+ }
+
+ return $line;
+}
+
+
+# Interactively repair one problem line: display the parsed columns next to
+# their headers, ask which kind of fix to apply (Regex always; Insert/
+# Delete/Join only when $insert_delete_allowed), prompt for an identifying
+# regex so the fix can be replayed on similar lines, then dispatch to the
+# matching fix_via_* helper.  Returns the (answer, possibly-fixed line).
+sub manual_fix {
+ my $line = shift;
+ my $cols = shift;
+ my $insert_delete_allowed = shift;
+ my $col_count = scalar(@{$cols}) > scalar(@headers) ? scalar(@{$cols}) : scalar(@headers);
+ my $max_header_length = 0;
+
+ # display columns nice and formatted
+ for (my $i = 0; $i < scalar(@headers); $i++) {
+ if (length($headers[$i]) > $max_header_length) {
+ max_header_length: ;
+ $max_header_length = length($headers[$i]);
+ }
+ }
+ for (my $i = 0; $i < $col_count; $i++) {
+ printf "#% d ", $i;
+ if (defined $headers[$i]) {
+ print " " x ($max_header_length - length($headers[$i]));
+ print $headers[$i];
+ } else {
+ print " " x $max_header_length;
+ }
+ print ": ";
+ if (defined $cols->[$i]) {
+ print "<" . $cols->[$i] . ">";
+ }
+ print "\n";
+ }
+
+ # prompt for type of fix
+fix_prompt:
+ print "\n" . format_for_display($line) . "\nFix line#$line_no? <Regex" . ($insert_delete_allowed ? '|Insert|Delete|Join' : '') . "|No> [n] ";
+ my $ans = readline(STDIN); chomp $ans;
+ $ans = uc(substr($ans,0,1));
+ if ($ans eq '') {
+ $ans = 'N';
+ }
+ if ($insert_delete_allowed ? index("RIDJN",$ans)==-1 : index("RN",$ans)==-1) {
+ goto fix_prompt;
+ }
+
+ # prompt for matching condition
+ my $id_regex;
+ if ($ans ne 'N') {
+ my $default_id_regex = id_cols_regex($cols);
+match_prompt:
+ print "If matching the end of the string, you may need to use \\s*\$ instead of \$\n";
+ print "Insert/Delete/Join fixes will also filter on column count.\n";
+ print "Identify this line (and optionally similar lines) with regex: [$default_id_regex] ";
+ $id_regex = readline(STDIN); chomp $id_regex;
+ if ($id_regex eq '') {
+ $id_regex = $default_id_regex;
+ }
+ if ($line =~ /$id_regex/) {
+ print "Regex matches line.\n";
+ } else {
+ print "Regex does not match line.\n";
+ goto match_prompt;
+ }
+ }
+
+ # prompt and perform actual fixes
+ switch($ans) {
+ case 'R' { $line = fix_via_regex($id_regex,$line); }
+ case 'I' { $line = fix_via_insert($id_regex,$cols); }
+ case 'D' { $line = fix_via_delete($id_regex,$cols); }
+ case 'J' { $line = fix_via_join($id_regex,$cols); }
+ case 'N' { }
+ else { $ans = 'N'; }
+ }
+
+ if ($ans ne 'N') {
+ print "\nNew line#$line_no: $line";
+ }
+ return ( $ans, $line );
+}
+
+################################################################## Init
+# Parse command-line options into file-scoped variables declared earlier
+# in this file. --help, or no file argument while STDIN is a terminal,
+# prints the usage text and exits.
+GetOptions(
+ 'config=s' => \$config,
+ 'idcols=s' => \$id_cols,
+ 'create-headers' => \$create_headers,
+ 'use-headers=s' => \$headers_file,
+ 'fix' => \$fix,
+ 'nosave' => \$nosave,
+ 'apply' => \$apply,
+ 'pad' => \$pad,
+ 'truncate' => \$truncate,
+ 'backslash' => \$backslash,
+ 'debug' => \$debug,
+ 'help|?' => \$help
+);
+if ($help || ((@ARGV == 0) && (-t STDIN))) {
+ die "\n\t$0 [--config <CONFIG>] [--idcols <idx1,idx2,...>] [--fix] [--apply] [--pad] [--truncate] <FILE>\n\n"
+ . "\tExpects <FILE> to be a CSV-like UTF-8 encoded file.\n"
+ . "\tWill produce <FILE>.clean and <FILE>.error versions of said file.\n\n"
+ . "\t--config <CONFIG> will read the Perl file <CONFIG> for settings information. See 'Example Config' below\n\n"
+ . "\t--create-headers will generate headers like so: col1, col2, col3, etc.\n"
+ . "\t--use-headers <HFILE> will generate headers based on the specified <HFILE>, which must contain one column header per line.\n"
+ . "\t(if neither --create-headers nor --use-headers are specified, then the first line in <FILE> is assumed to contain the column headers)\n\n"
+ . "\t--fix will prompt for whether and how to fix broken records, and save those fixes in <FILE>.fixes\n"
+ . "\t--idcols <idx1,idx2,...> takes a comma-separated list of column indexes (starting with 0) to use as matchpoint suggestions for fixes\n"
+ . "\t--nosave will prevent new fixes from being saved in <FILE>.fixes\n"
+ . "\t--apply will apply previously recorded fixes from <FILE>.fixes\n\n"
+ . "\t--pad will fill in missing columns at the end if needed for otherwise unbroken records\n"
+ . "\t--truncate will strip extra columns from the end if needed for otherwise unbroken records\n"
+ . "\t--backslash will convert backslashes into forward slashes\n\n"
+ . "\t Example Config:\n\n"
+ . "\t\t\$CSV_options{quote_char} = '\"';\n"
+ . "\t\t\$CSV_options{escape_char} = '\"';\n"
+ . "\t\t\$CSV_options{sep_char} = ',';\n"
+ . "\t\t\$CSV_options{eol} = \$\\;\n"
+ . "\t\t\$CSV_options{always_quote} = 0;\n"
+ . "\t\t\$CSV_options{quote_space} = 1;\n"
+ . "\t\t\$CSV_options{quote_null} = 1;\n"
+ . "\t\t\$CSV_options{quote_binary} = 1;\n"
+ . "\t\t\$CSV_options{binary} = 0;\n"
+ . "\t\t\$CSV_options{decode_utf8} = 1;\n"
+ . "\t\t\$CSV_options{keep_meta_info} = 0;\n"
+ . "\t\t\$CSV_options{allow_loose_quotes} = 0;\n"
+ . "\t\t\$CSV_options{allow_loose_escapes} = 0;\n"
+ . "\t\t\$CSV_options{allow_unquoted_escape} = 0;\n"
+ . "\t\t\$CSV_options{allow_whitespace} = 0;\n"
+ . "\t\t\$CSV_options{blank_is_undef} = 0;\n"
+ . "\t\t\$CSV_options{empty_is_undef} = 0;\n"
+ . "\t\t\$CSV_options{verbatim} = 0;\n"
+ . "\n\n";
+}
+# Sanity-check the input and optional config file before doing any work.
+if (! -e $ARGV[0]) {
+ die "$ARGV[0] does not exist\n";
+}
+if ($config && ! -e $config) {
+ die "$config does not exist\n";
+}
+# With --apply, load previously saved fixes (a Storable image written by
+# save_fix) from <FILE>.fixes, if present.
+if ($apply && -e $ARGV[0] . '.fixes') {
+ $fixes = retrieve($ARGV[0] . '.fixes');
+}
+
+################################################################## CSV Setup
+# Guess the field separator from the file itself (Text::CSV::Separator);
+# a --config file evaluated below may override this and any other option.
+$CSV_options{sep_char} = get_separator( path => $ARGV[0], lucky => 1 );
+if ($config && -e $config) {
+ do $config;
+}
+$csv = Text::CSV_XS->new(\%CSV_options);
+# Error callback: record which line numbers failed to parse, show the
+# offending input with a caret under the error position, then clear the
+# diagnostic so parsing can continue with the next line.
+$csv->callbacks(
+ error => sub {
+ my ($err, $msg, $pos, $recno) = @_;
+ return if ($err == 2012); # 2012 = EOF, not a real error (per Text::CSV_XS docs)
+ $line_numbers_for_lines_with_errors{$line_no} = 1;
+ print "\nline#$line_no * $err : $msg -> (pos#$pos,rec#$recno)\n";
+ if ($csv->error_input) {
+ print $csv->error_input;
+ print "-" x ($pos - 1);
+ print "^\n";
+ }
+ $csv->SetDiag(0);
+ return;
+ }
+);
+
+################################################################## Reading
+
+# With --use-headers, preload the column headers (one per line, trimmed)
+# and fix the expected column count before reading any data.
+if ($headers_file) {
+ print "_.,-~= reading $headers_file\n";
+ open my $hfile, "<:encoding(utf8)", $headers_file or die "$headers_file: $!";
+ while (my $line = <$hfile>) {
+ chomp $line;
+ $line =~ s/\s+$//;
+ $line =~ s/^\s+//;
+ push @headers, $line;
+ }
+ close $hfile;
+ $expected_column_count = scalar(@headers);
+ print "Expected column count set to $expected_column_count based on headers.\n";
+}
+
+# Main read loop. Each transformation (applied fix, padding, truncation,
+# manual fix) rewrites $line and then uses `redo` to re-run the loop body
+# on the rewritten line WITHOUT reading new input or advancing $line_no,
+# so a line is only counted once it parses (or is given up on).
+print "_.,-~= reading $ARGV[0]\n";
+open my $in, "<:encoding(utf8)", $ARGV[0] or die "$ARGV[0]: $!";
+$line_no = 1;
+while (my $line = <$in>) {
+ print ">>> main loop (#$line_no): $line" if $debug;
+ if ($backslash) {
+ $line = convert_backslashes($line);
+ }
+ if ($apply) {
+ $line = apply_line_fixes($line);
+ }
+ if ($csv->parse($line)) {
+ my @columns = $csv->fields();
+ # First parseable row establishes the expected column count and the
+ # headers (either generated col1..colN or taken from the row itself).
+ if (! $expected_column_count) {
+ $expected_column_count = scalar(@columns);
+ print "Expected column count set to $expected_column_count based on first row.\n";
+ for (my $i = 0; $i < scalar(@columns) ; $i++) {
+ if ($create_headers) {
+ push @headers, "col" . ($i+1);
+ } else {
+ push @headers, $columns[$i];
+ }
+ }
+ }
+ if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+ # The error callback flagged this line during parse; offer a
+ # regex-only manual fix and re-process on success.
+ if ($fix) {
+ my $fix_status;
+ ($fix_status,$line) = manual_fix($line,\@columns,0); # Regex only
+ if ($fix_status ne 'N') {
+ delete $line_numbers_for_lines_with_errors{$line_no};
+ $fix_count++;
+ redo;
+ }
+ }
+ } else {
+ # Parsed cleanly, but may still have the wrong number of columns.
+ if (scalar(@columns) < $expected_column_count) {
+ if ($apply) {
+ my $new_line = apply_insert_fixes($line,\@columns);
+ if ($line ne $new_line) {
+ $line = $new_line;
+ redo;
+ }
+ }
+ if ($pad) {
+ $pad_count++;
+ print "\nline#$line_no>> padding line, from " . scalar(@columns) . " columns ";
+ my $col_count = scalar(@columns);
+ for (my $i = 0; $i < $expected_column_count - $col_count; $i++) {
+ push @columns, '#pad#';
+ }
+ print "to " . scalar(@columns) . " columns.\n";
+ eval {
+ print "before: " . format_for_display($line);
+ $line = combine_cols(\@columns);
+ print " after: " . format_for_display($line);
+ };
+ if ($@) {
+ print "padding error:\n";
+ die $@;
+ }
+ redo;
+ }
+ }
+ if (scalar(@columns) > $expected_column_count) {
+ if ($apply) {
+ my $new_line = apply_delete_fixes($line,\@columns);
+ if ($line ne $new_line) {
+ $line = $new_line;
+ redo;
+ }
+ $new_line = apply_join_fixes($line,\@columns);
+ if ($line ne $new_line) {
+ $line = $new_line;
+ redo;
+ }
+ }
+ if ($truncate) {
+ $trunc_count++;
+ print "\nline#$line_no>> truncating line, from " . scalar(@columns) . " columns ";
+ splice @columns, $expected_column_count;
+ print "to " . scalar(@columns) . " columns.\n";
+ eval {
+ print "before: " . format_for_display($line);
+ $line = combine_cols(\@columns);
+ print " after: " . format_for_display($line);
+ };
+ if ($@) {
+ print "truncating error:\n";
+ die $@;
+ }
+ redo;
+ }
+ }
+ if (scalar(@columns) != $expected_column_count) {
+ # so broken, but parseable, and thus not handled by the error callback
+ print "\nline#$line_no * Expected $expected_column_count columns but found " . scalar(@columns) . "\n$line";
+ print "-" x length($line) . "\n";
+ $line_numbers_for_lines_with_errors{$line_no} = 1;
+ if ($fix) {
+ my $fix_status;
+ ($fix_status,$line) = manual_fix($line,\@columns,1); # Insert/Delete allowed
+ if ($fix_status ne 'N') {
+ delete $line_numbers_for_lines_with_errors{$line_no};
+ $fix_count++;
+ redo;
+ }
+ }
+ }
+ }
+ # Route the (possibly repaired) line to the error or clean bucket.
+ if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+ print "\tIncrementing errors with line# $line_no\n" if $debug;
+ push @lines_with_errors, $line;
+ } else {
+ print "\tIncrementing clean with line# $line_no\n" if $debug;
+ push @parsed_rows, \@columns;
+ }
+ $line_no++;
+ } else {
+ # parse() returned false without invoking the callback — fatal.
+ die "Parsing error:\n" . $csv->error_input . "\n";
+ }
+}
+close $in;
+print "_.,-~= read " . ($line_no-1) . " records ";
+print "(" . scalar(@lines_with_errors) . " with errors, $pad_count auto-padded, $trunc_count auto-truncated, $backslash_count backslashes converted, $fix_count manual fixes)\n";
+
+
+################################################################## Writing good CSV
+
+# Re-serialize every successfully parsed row to <FILE>.clean. When headers
+# were generated or loaded from a file, prepend them as the first row.
+print "_.,-~= writing $ARGV[0].clean\n";
+open my $out, ">:encoding(utf8)", "$ARGV[0].clean" or die "$ARGV[0].clean: $!";
+$line_no = 1;
+$actual_count = 0;
+if ($create_headers || $headers_file) {
+ unshift @parsed_rows, \@headers;
+}
+foreach my $row (@parsed_rows) {
+ eval {
+ $line = combine_cols($row);
+ print $out $line;
+ };
+ if ($@) {
+ print "error:\n";
+ die $@;
+ }
+ $actual_count++;
+}
+close $out;
+print "_.,-~= wrote " . ($actual_count) . " records\n";
+
+
+################################################################## Writing broken CSV
+
+# Write the raw, unrepaired broken lines to <FILE>.error for later review.
+print "_.,-~= writing $ARGV[0].error\n";
+open my $out2, ">:encoding(utf8)", "$ARGV[0].error" or die "$ARGV[0].error: $!";
+foreach my $row (@lines_with_errors) {
+ print $out2 $row;
+}
+close $out2;
+print "_.,-~= wrote " . (scalar @lines_with_errors) . " records\n";
+
+
+################################################################## .no_headers version
+
+print "_.,-~= creating $ARGV[0].clean.no_headers\n";
+
+# Produce a headerless copy by dropping the first line. Quote the
+# filenames so paths containing spaces or shell metacharacters survive
+# the shell (they were previously interpolated unquoted).
+print `tail -n +2 "$ARGV[0].clean" > "$ARGV[0].clean.no_headers"`;
+
+################################################################## Finished
+
+print "_.,-~= finished\n";
--- /dev/null
+Text::CSV::Separator
+Switch
--- /dev/null
+#!/usr/bin/perl -w
+use Getopt::Long;
+use Text::CSV::Auto;
+use Data::Dumper;
+use DBI;
+use File::Basename;
+
+my $dbh;
+my $cfg;
+my $csv_config;
+
+# init()
+# Connect to the Evergreen offline database (credentials come from
+# /openils/conf/offline-config.pl, which populates %config), build the
+# default $cfg (schema m_foo, Text::CSV::Auto options with a binary,
+# auto-diagnosing CSV parser), parse command-line options into $cfg, and
+# validate the input file and optional CSV config file.
+# NOTE(review): $help is used without a declaration; this file runs
+# without `use strict`.
+sub init {
+ our %config;
+ do '/openils/conf/offline-config.pl';
+ $dbh = DBI->connect( $config{dsn}, $config{usr}, $config{pw} ) or die $DBI::errstr;
+ $cfg = {
+ schema => 'm_foo',
+ auto_options => {
+ }
+ };
+ our %CSV_options = (
+ binary => 1,
+ auto_diag => 1,
+ diag_verbose => 1,
+ );
+ $cfg->{auto_options}->{csv_options} = \%CSV_options;
+
+ GetOptions(
+ 'config=s' => \$csv_config,
+ 'no-legacy-prefix' => \($cfg->{no_legacy_prefix}),
+ 'use-no-headers-file' => \($cfg->{use_no_headers_file}),
+ 'add-x-migrate' => \($cfg->{add_x_migrate}),
+ 'outfile=s' => \($cfg->{outfile}),
+ 'schema=s' => \($cfg->{schema}),
+ 'parent=s' => \($cfg->{parent}),
+ 'help|?' => \$help
+ );
+ if ($help || ((@ARGV == 0) && (-t STDIN))) {
+ die qq^\n\t$0 [--config <CONFIG>] [--add-x-migrate] [--no-legacy-prefix] [--schema <schema>] [--parent <base table>] [--outfile <file to create>] <"clean" file from clean_csv script>\n\n^;
+ }
+ if ($csv_config && ! -e $csv_config) {
+ die "$csv_config does not exist\n";
+ }
+ # A --config file is raw Perl that may override %CSV_options.
+ if ($csv_config && -e $csv_config) {
+ do $csv_config;
+ }
+ if (! -e $ARGV[0]) {
+ die "$ARGV[0] does not exist\n";
+ }
+}
+
+# write_sql_sample($cfg, $info)
+# Create the output .sql file ($cfg->{outfile}, or <input>.sql) and write
+# a leading SQL comment block containing the first five lines of the input
+# file as a sample. Returns the open write filehandle so the caller can
+# append the table definition and COPY statement.
+sub write_sql_sample {
+ my $cfg = shift;
+ my $info = shift;
+ my $fn = $cfg->{outfile} || $cfg->{auto_options}->{file} . '.sql';
+
+ print "\twriting $fn\n";
+ # 3-arg open with a lexical handle, checked: the old 2-arg bareword
+ # open ignored failures and would silently write nowhere.
+ open my $sql, '>', $fn or die "$fn: $!";
+ print $sql "-- $cfg->{auto_options}->{file}\n/*\n";
+ open my $in, '<', $cfg->{auto_options}->{file} or die "$cfg->{auto_options}->{file}: $!";
+ foreach (1..5) {
+ my $line = <$in>;
+ last unless defined $line; # input may have fewer than 5 lines
+ print $sql $line;
+ }
+ close $in;
+ print $sql "*/\n";
+ return $sql;
+}
+
+# write_sql_table($sql_fh, $cfg, $info)
+# Append a DROP + CREATE UNLOGGED TABLE statement to the output .sql file.
+# The table is named after --parent (<parent>_legacy, inheriting from
+# schema.parent) or else after the input file name (lowercased, with
+# "-. " mapped to "_"). Every column is TEXT except x_eg_bib_id (BIGINT),
+# with an "l_" prefix added unless suppressed or already x_/l_-prefixed.
+# Columns named x_* (and x_migrate, when requested) get indexes, and each
+# column definition carries the Text::CSV::Auto analysis as a comment.
+sub write_sql_table {
+ my $sql = shift;
+ my $cfg = shift;
+ my $info = shift;
+ my $fn = $cfg->{auto_options}->{file};
+ my @indices = ();
+
+ print "\twriting table definition\n";
+ if ($cfg->{parent}) {
+ $cfg->{table_name} = $cfg->{parent} . '_legacy';
+ } else {
+ $cfg->{table_name} = lc(basename($fn)); $cfg->{table_name} =~ s/[\-\. ]/_/g;
+ }
+ print $sql "DROP TABLE IF EXISTS $cfg->{schema}.$cfg->{table_name};\n";
+ print $sql "CREATE UNLOGGED TABLE $cfg->{schema}.$cfg->{table_name} (\n";
+ # $idx doubles as the "need a leading comma?" flag: 0 for the first
+ # emitted column, truthy afterwards.
+ my $idx = 0;
+ if ($cfg->{add_x_migrate}) {
+ print $sql " x_migrate BOOLEAN\n";
+ $idx++;
+ push @indices, 'x_migrate';
+ }
+ foreach my $column (@{ $info }) {
+ my $cn = $column->{'header'};
+ if ($cn =~ /^x_/) {
+ push @indices, $cn;
+ }
+ # Embed the Data::Dumper analysis of this column as a SQL comment.
+ my $col_info = Dumper($column);
+ $col_info =~ s/^\$VAR1 = //;
+ print $sql " " . ($idx++ ? ',' : ' ');
+ print $sql "l_" unless $cfg->{no_legacy_prefix} or $column->{'header'} =~ /^x_/ or $column->{'header'} =~ /^l_/;
+ print $sql "$cn " . ($cn eq 'x_eg_bib_id' ? 'BIGINT' : 'TEXT');
+ print $sql " /*\n $col_info */\n";
+ }
+ if ($cfg->{parent}) {
+ print $sql ') INHERITS (' . $cfg->{schema} . '.' . $cfg->{parent} . ");\n";
+ } else {
+ print $sql ");\n";
+ }
+ foreach my $cn (@indices) {
+ print $sql "CREATE INDEX ON $cfg->{schema}.$cfg->{table_name} ($cn);\n";
+ }
+}
+
+# write_sql_loader($sql_fh, $cfg, $auto, $info)
+# Append a psql \COPY statement loading the input file (or its
+# .no_headers variant) into the staging table. Column names mirror the
+# prefixing logic used in write_sql_table. For tab-separated input the
+# default text format is used; otherwise CSV options (delimiter, quote,
+# escape) are emitted, quoted via the live $dbh handle.
+sub write_sql_loader {
+ my $sql = shift;
+ my $cfg = shift;
+ my $auto = shift;
+ my $info = shift;
+ my $fn = $cfg->{auto_options}->{file} . ($cfg->{use_no_headers_file} ? '.no_headers' : '');
+
+ print "\twriting copy statement\n";
+ print $sql "\n\\COPY $cfg->{schema}.$cfg->{table_name} (";
+ my $idx = 0;
+ foreach my $column (@{ $info }) {
+ print $sql ($idx++ ? ',' : '');
+ print $sql "l_" unless $cfg->{no_legacy_prefix} or $column->{'header'} =~ /^x_/ or $column->{'header'} =~ /^l_/;
+ print $sql $column->{'header'};
+ }
+ print $sql ") FROM '$fn'";
+ if ($auto->csv->sep_char eq chr(9) && ! defined $auto->csv->quote_char && ! defined $auto->csv->escape_char) {
+ # true .tsv, don't treat as csv
+ } elsif ($auto->csv->sep_char eq chr(9)) {
+ # probably good enough .tsv, don't treat as csv
+ } else {
+ # $dbh->quote(undef) returns the string 'NULL'; skip those options.
+ print $sql " WITH csv " . ($cfg->{use_no_headers_file} ? "" : "header");
+ print $sql " delimiter " . $dbh->quote( $auto->csv->sep_char ) unless $dbh->quote( $auto->csv->sep_char ) eq 'NULL';
+ print $sql " quote " . $dbh->quote( $auto->csv->quote_char ) unless $dbh->quote( $auto->csv->quote_char ) eq 'NULL';
+ print $sql " escape " . $dbh->quote( $auto->csv->escape_char ) unless $dbh->quote( $auto->csv->escape_char ) eq 'NULL';
+ }
+ print $sql "\n";
+}
+
+# Program entry point: set up config and the database handle, then emit
+# one .sql staging script per input file named on the command line.
+sub main {
+ init();
+ for my $input_file (@ARGV) {
+ print "processing $input_file\n";
+ $cfg->{auto_options}->{file} = $input_file;
+
+ my $auto = Text::CSV::Auto->new($cfg->{auto_options});
+ my $info = $auto->analyze();
+
+ my $sql_fh = write_sql_sample($cfg, $info);
+ write_sql_table($sql_fh, $cfg, $info);
+ write_sql_loader($sql_fh, $cfg, $auto, $info);
+ close $sql_fh;
+
+ print "\tdone.\n";
+ }
+}
+
+main();
+
--- /dev/null
+Text::CSV::Auto
--- /dev/null
+#!/usr/bin/perl -w
+# Concatenate CSV/TSV exports, optionally de-duplicating header rows and
+# prepending an x_source column identifying the originating file.
+
+use strict;
+use Getopt::Long;
+
+my $help;
+my $headers = 1; # first line of each file is a header row
+my $sourcecolumn = 1; # prepend an x_source column
+my $sourcetype = 'partial'; # 'partial' = first path component; 'full' = whole path
+my $first_headers; # header line remembered from the first file
+my $use_tab = 1;
+my $use_comma = 0;
+
+GetOptions(
+ 'headers!' => \$headers,
+ 'source!' => \$sourcecolumn,
+ 'source-type=s' => \$sourcetype,
+ 'use-tab' => \$use_tab,
+ 'use-comma' => \$use_comma,
+ 'help|?' => \$help
+);
+# --use-comma wins over the tab default.
+if ($use_comma) {
+ $use_tab = 0;
+}
+if ($help || @ARGV == 0 || ($sourcetype ne 'partial' && $sourcetype ne 'full')) {
+ print "$0 <--headers|--noheaders> <--source|--nosource> <--source-type=full|--source-type=partial> <--use-tab|--use-comma> [file1] [file2] <fileN...>\n";
+ exit 0;
+}
+
+# Build the x_source column value for one data row: the file name (or,
+# with --source-type=partial, only its first path component) followed by
+# the active field separator.
+sub munged_source {
+ my ($fn) = @_;
+ my $source = $sourcetype eq 'partial'
+ ? (split(/\//, $fn))[0]
+ : $fn;
+ my $sep = $use_tab ? "\t" : ',';
+ return $source . $sep;
+}
+
+# cat_file($fn)
+# Stream one input file to stdout, optionally prefixing each data row
+# with an x_source column. With --headers, the first file's header line
+# is printed (after an optional x_source header) and remembered;
+# subsequent files must carry an identical header line or we abort.
+sub cat_file {
+ my $fn = shift;
+ # Checked 3-arg open with a lexical handle: the old unchecked 2-arg
+ # bareword open silently produced empty output for unreadable files.
+ open my $fh, '<', $fn or die "$fn: $!\n";
+ if ($headers) {
+ if ($fn ne $ARGV[0]) {
+ my $check = <$fh>; # check and throw away headers for subsequent files
+ if ($check ne $first_headers) {
+ print STDERR "Mismatched headers between $ARGV[0] and $fn\n";
+ print STDERR "$first_headers\n$check\n";
+ exit 1;
+ }
+ } else {
+ print "x_source" . ($use_tab ? "\t" : ',') if $sourcecolumn; # potential column header
+ $first_headers = <$fh>;
+ print $first_headers;
+ }
+ }
+ while (my $line = <$fh>) {
+ print munged_source($fn) if $sourcecolumn;
+ print $line;
+ }
+ close $fh;
+}
+
+# Concatenate every file named on the command line, in order. cat_file()
+# treats $ARGV[0] specially as the header-defining first file.
+foreach my $input (@ARGV) {
+ cat_file($input);
+}
+
+
+
+
--- /dev/null
+Barcode
+unknown
+Last name
+First name
+Middle name
+Expiration date
+unknown
+Birth date
+Gender
+Patron Type
+Status
+User defined 1
+User defined 2
+User defined 3
+User defined 4
+Address line 1
+Address line 2
+City
+State
+Zip
+Email 1
+Phone 1
+Phone 2
+Secondary address line 1
+Secondary address line 2
+Secondary city
+Secondary state
+Secondary zip
+Email 2
+Secondary phone 1
+Secondary phone 2
+unknown
+Grade level
+unknown
+Nickname
+Acceptable use policy on file
+unknown
--- /dev/null
+x_egid
+x_hseq
+l_value
--- /dev/null
+$CSV_options{quote_char} = '"';
+$CSV_options{escape_char} = '"';
+$CSV_options{sep_char} = ',';
--- /dev/null
+price 852 9
+library 852 a
+location 852 b
+call_number 852 h
+cn_item_number 852 i
+cn_prefix 852 k
+cn_suffix 852 m
+barcode 852 p
+copy_number 852 t
+item_note 852 x m:multi
+extra1 926 a
+extra2 926 b
--- /dev/null
+t949a 949 a
+t949b 949 b
+t949c 949 c
+t949d 949 d
+t949e 949 e
+t949f 949 f
+t949g 949 g
+t949h 949 h
+t949i 949 i
+t949j 949 j
+t949k 949 k
+t949n 949 n
+t949o 949 o
+t949p 949 p
+t949q 949 q
+t949r 949 r
+t949s 949 s
+t949t 949 t
+t949u 949 u
+t949v 949 v
+t949w 949 w
+t949x 949 x
+t949y 949 y
+t949z 949 z
--- /dev/null
+$CSV_options{quote_char} = undef;
+$CSV_options{escape_char} = undef;
+$CSV_options{sep_char} = chr(9);
--- /dev/null
+#!/usr/bin/perl
+# Re-join records that were split across physical lines: keep printing
+# input lines without a newline until roughly $colcount fields have been
+# seen, then emit the record-terminating newline.
+
+my $delimiter = "\t";
+my $colcount = 11;
+
+my $running_field_count = 0;
+
+while (my $line = <>) {
+ chomp $line;
+ print $line; # no newline yet; decided below once enough fields are seen
+ my @f = split /$delimiter/, $line;
+ # NOTE(review): perl's split drops trailing empty fields by default, so
+ # a line ending in empty columns undercounts here — confirm this is
+ # acceptable for the data this script targets.
+ if (scalar(@f) != 1) {
+ # A line with no delimiter at all contributes nothing to the count.
+ $running_field_count += scalar(@f);
+ }
+ if ($running_field_count >= $colcount) {
+ $running_field_count = 0;
+ print "\n";
+ }
+}
+
--- /dev/null
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# Hello, this program is written in Python - http://python.org
+programname = 'html2csv - version 2002-09-20 - http://sebsauvage.net'
+
+import sys, getopt, os.path, glob, HTMLParser, re
+
+try: import psyco ; psyco.jit() # If present, use psyco to accelerate the program
+except: pass
+
+def usage(progname):
+ ''' Return the program usage text, with the invocation name adjusted
+ to "python <script>" when run from a .py/.pyc file. (Python 2.) '''
+ progname = os.path.split(progname)[1]
+ if os.path.splitext(progname)[1] in ['.py','.pyc']: progname = 'python '+progname
+ return '''%s
+A coarse HTML tables to CSV (Comma-Separated Values) converter.
+
+Syntax : %s source.html
+
+Arguments : source.html is the HTML file you want to convert to CSV.
+ By default, the file will be converted to csv with the same
+ name and the csv extension (source.html -> source.csv)
+ You can use * and ?.
+
+Examples : %s mypage.html
+ : %s *.html
+
+This program is public domain.
+Author : Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
+ http://sebsauvage.net
+''' % (programname, progname, progname, progname)
+
+class html2csv(HTMLParser.HTMLParser):
+ ''' A basic parser which converts HTML tables into CSV.
+ Feed HTML with feed(). Get CSV with getCSV(). (See example below.)
+ All tables in HTML will be converted to CSV (in the order they occur
+ in the HTML file).
+ You can process very large HTML files by feeding this class with chunks
+ of html while getting chunks of CSV by calling getCSV().
+ Should handle badly formatted html (missing <tr>, </tr>, </td>,
+ extraneous </td>, </tr>...).
+ This parser uses HTMLParser from the HTMLParser module,
+ not HTMLParser from the htmllib module.
+ Example: parser = html2csv()
+ parser.feed( open('mypage.html','rb').read() )
+ open('mytables.csv','w+b').write( parser.getCSV() )
+ This class is public domain.
+ Author: Sébastien SAUVAGE <sebsauvage at sebsauvage dot net>
+ http://sebsauvage.net
+ Versions:
+ 2002-09-19 : - First version
+ 2002-09-20 : - now uses HTMLParser.HTMLParser instead of htmllib.HTMLParser.
+ - now parses command-line.
+ To do:
+ - handle <PRE> tags
+ - convert html entities (&name; and &#ref;) to Ascii.
+ '''
+ def __init__(self):
+ HTMLParser.HTMLParser.__init__(self)
+ self.CSV = '' # The CSV data
+ self.CSVrow = '' # The current CSV row being constructed from HTML
+ self.inTD = 0 # Used to track if we are inside or outside a <TD>...</TD> tag.
+ self.inTR = 0 # Used to track if we are inside or outside a <TR>...</TR> tag.
+ self.re_multiplespaces = re.compile('\s+') # regular expression used to remove spaces in excess
+ self.rowCount = 0 # CSV output line counter.
+ def handle_starttag(self, tag, attrs):
+ if tag == 'tr': self.start_tr()
+ elif tag == 'td': self.start_td()
+ def handle_endtag(self, tag):
+ if tag == 'tr': self.end_tr()
+ elif tag == 'td': self.end_td()
+ def start_tr(self):
+ if self.inTR: self.end_tr() # <TR> implies </TR>
+ self.inTR = 1
+ def end_tr(self):
+ if self.inTD: self.end_td() # </TR> implies </TD>
+ self.inTR = 0
+ if len(self.CSVrow) > 0:
+ # Drop the trailing comma left by the last end_td().
+ self.CSV += self.CSVrow[:-1]
+ self.CSVrow = ''
+ self.CSV += '\n'
+ self.rowCount += 1
+ def start_td(self):
+ if not self.inTR: self.start_tr() # <TD> implies <TR>
+ self.CSVrow += '"'
+ self.inTD = 1
+ def end_td(self):
+ if self.inTD:
+ self.CSVrow += '",'
+ self.inTD = 0
+ def handle_data(self, data):
+ # Cell text: flatten tabs/newlines, double embedded quotes, collapse
+ # runs of whitespace to a single space.
+ if self.inTD:
+ self.CSVrow += self.re_multiplespaces.sub(' ',data.replace('\t',' ').replace('\n','').replace('\r','').replace('"','""'))
+ def getCSV(self,purge=False):
+ ''' Get output CSV.
+ If purge is true, getCSV() will return all remaining data,
+ even if <td> or <tr> are not properly closed.
+ (You would typically call getCSV with purge=True when you do not have
+ any more HTML to feed and you suspect dirty HTML (unclosed tags). '''
+ if purge and self.inTR: self.end_tr() # This will also end_td and append last CSV row to output CSV.
+ dataout = self.CSV[:]
+ self.CSV = ''
+ return dataout
+
+
+if __name__ == "__main__":
+ try: # Put getopt in place for future usage.
+ opts, args = getopt.getopt(sys.argv[1:],None)
+ except getopt.GetoptError:
+ print usage(sys.argv[0]) # print help information and exit:
+ sys.exit(2)
+ if len(args) == 0:
+ print usage(sys.argv[0]) # print help information and exit:
+ sys.exit(2)
+ print programname
+ html_files = glob.glob(args[0])
+ for htmlfilename in html_files:
+ outputfilename = os.path.splitext(htmlfilename)[0]+'.csv'
+ parser = html2csv()
+ print 'Reading %s, writing %s...' % (htmlfilename, outputfilename)
+ try:
+ htmlfile = open(htmlfilename, 'rb')
+ csvfile = open( outputfilename, 'w+b')
+ # Feed the parser in 8 KB chunks and drain its CSV buffer as we
+ # go, so arbitrarily large HTML files stay within memory bounds.
+ data = htmlfile.read(8192)
+ while data:
+ parser.feed( data )
+ csvfile.write( parser.getCSV() )
+ sys.stdout.write('%d CSV rows written.\r' % parser.rowCount)
+ data = htmlfile.read(8192)
+ csvfile.write( parser.getCSV(True) )
+ csvfile.close()
+ htmlfile.close()
+ except:
+ # NOTE(review): the bare except hides the actual error (including
+ # KeyboardInterrupt) — consider logging the exception.
+ print 'Error converting %s ' % htmlfilename
+ try: htmlfile.close()
+ except: pass
+ try: csvfile.close()
+ except: pass
+ print 'All done. '
--- /dev/null
+#!/usr/bin/python
+# -*- coding: iso-8859-1 -*-
+# Hello, this program is written in Python - http://python.org
+programname = 'html2tsv - version 2002-09-20 - http://sebsauvage.net'
+
+import sys, getopt, os.path, glob, HTMLParser, re
+
+try: import psyco ; psyco.jit() # If present, use psyco to accelerate the program
+except: pass
+
+def usage(progname):
+ ''' Return the program usage text, with the invocation name adjusted
+ to "python <script>" when run from a .py/.pyc file. (Python 2.) '''
+ progname = os.path.split(progname)[1]
+ if os.path.splitext(progname)[1] in ['.py','.pyc']: progname = 'python '+progname
+ return '''%s
+A coarse HTML tables to TSV (Tab-Separated Values) converter.
+
+Syntax : %s source.html
+
+Arguments : source.html is the HTML file you want to convert to TSV.
+ By default, the file will be converted to tsv with the same
+ name and the tsv extension (source.html -> source.tsv)
+ You can use * and ?.
+
+Examples : %s mypage.html
+ : %s *.html
+
+This program is public domain.
+Author : Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
+ http://sebsauvage.net
+''' % (programname, progname, progname, progname)
+
+class html2tsv(HTMLParser.HTMLParser):
+ ''' A basic parser which converts HTML tables into TSV.
+ Feed HTML with feed(). Get TSV with getTSV(). (See example below.)
+ All tables in HTML will be converted to TSV (in the order they occur
+ in the HTML file).
+ You can process very large HTML files by feeding this class with chunks
+ of html while getting chunks of TSV by calling getTSV().
+ Should handle badly formatted html (missing <tr>, </tr>, </td>,
+ extraneous </td>, </tr>...).
+ This parser uses HTMLParser from the HTMLParser module,
+ not HTMLParser from the htmllib module.
+ Example: parser = html2tsv()
+ parser.feed( open('mypage.html','rb').read() )
+ open('mytables.tsv','w+b').write( parser.getTSV() )
+ This class is public domain.
+ Author: Sébastien SAUVAGE <sebsauvage at sebsauvage dot net>
+ http://sebsauvage.net
+ Versions:
+ 2002-09-19 : - First version
+ 2002-09-20 : - now uses HTMLParser.HTMLParser instead of htmllib.HTMLParser.
+ - now parses command-line.
+ To do:
+ - handle <PRE> tags
+ - convert html entities (&name; and &#ref;) to Ascii.
+ '''
+ def __init__(self):
+ HTMLParser.HTMLParser.__init__(self)
+ self.TSV = '' # The TSV data
+ self.TSVrow = '' # The current TSV row being constructed from HTML
+ self.inTD = 0 # Used to track if we are inside or outside a <TD>...</TD> tag.
+ self.inTR = 0 # Used to track if we are inside or outside a <TR>...</TR> tag.
+ self.re_multiplespaces = re.compile('\s+') # regular expression used to remove spaces in excess
+ self.rowCount = 0 # TSV output line counter.
+ def handle_starttag(self, tag, attrs):
+ if tag == 'tr': self.start_tr()
+ elif tag == 'td': self.start_td()
+ def handle_endtag(self, tag):
+ if tag == 'tr': self.end_tr()
+ elif tag == 'td': self.end_td()
+ def start_tr(self):
+ if self.inTR: self.end_tr() # <TR> implies </TR>
+ self.inTR = 1
+ def end_tr(self):
+ if self.inTD: self.end_td() # </TR> implies </TD>
+ self.inTR = 0
+ if len(self.TSVrow) > 0:
+ # Drop the trailing tab left by the last end_td().
+ self.TSV += self.TSVrow[:-1]
+ self.TSVrow = ''
+ self.TSV += '\n'
+ self.rowCount += 1
+ def start_td(self):
+ if not self.inTR: self.start_tr() # <TD> implies <TR>
+ self.TSVrow += ''
+ self.inTD = 1
+ def end_td(self):
+ if self.inTD:
+ self.TSVrow += '\t'
+ self.inTD = 0
+ def handle_data(self, data):
+ # Cell text: flatten tabs/newlines, double embedded quotes, collapse
+ # runs of whitespace to a single space.
+ if self.inTD:
+ self.TSVrow += self.re_multiplespaces.sub(' ',data.replace('\t',' ').replace('\n','').replace('\r','').replace('"','""'))
+ def getTSV(self,purge=False):
+ ''' Get output TSV.
+ If purge is true, getTSV() will return all remaining data,
+ even if <td> or <tr> are not properly closed.
+ (You would typically call getTSV with purge=True when you do not have
+ any more HTML to feed and you suspect dirty HTML (unclosed tags). '''
+ if purge and self.inTR: self.end_tr() # This will also end_td and append last TSV row to output TSV.
+ dataout = self.TSV[:]
+ self.TSV = ''
+ return dataout
+
+
+if __name__ == "__main__":
+ try: # Put getopt in place for future usage.
+ opts, args = getopt.getopt(sys.argv[1:],None)
+ except getopt.GetoptError:
+ print usage(sys.argv[0]) # print help information and exit:
+ sys.exit(2)
+ if len(args) == 0:
+ print usage(sys.argv[0]) # print help information and exit:
+ sys.exit(2)
+ print programname
+ html_files = glob.glob(args[0])
+ for htmlfilename in html_files:
+ outputfilename = os.path.splitext(htmlfilename)[0]+'.tsv'
+ parser = html2tsv()
+ print 'Reading %s, writing %s...' % (htmlfilename, outputfilename)
+ try:
+ htmlfile = open(htmlfilename, 'rb')
+ tsvfile = open( outputfilename, 'w+b')
+ # Feed the parser in 8 KB chunks and drain its TSV buffer as we
+ # go, so arbitrarily large HTML files stay within memory bounds.
+ data = htmlfile.read(8192)
+ while data:
+ parser.feed( data )
+ tsvfile.write( parser.getTSV() )
+ sys.stdout.write('%d TSV rows written.\r' % parser.rowCount)
+ data = htmlfile.read(8192)
+ tsvfile.write( parser.getTSV(True) )
+ tsvfile.close()
+ htmlfile.close()
+ except:
+ # NOTE(review): the bare except hides the actual error (including
+ # KeyboardInterrupt) — consider logging the exception.
+ print 'Error converting %s ' % htmlfilename
+ try: htmlfile.close()
+ except: pass
+ try: tsvfile.close()
+ except: pass
+ print 'All done. '
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+join_lines - program for joining adjacent lines from stdin or one or more files
+
+=head1 SYNOPSIS
+
+B<join_lines> [options...] [text files...]
+
+=head1 DESCRIPTION
+
+B<join_lines> is used to join adjacent lines from stdin or one or more files,
+trimming trailing line feeds and carriage returns, and sending the output to
+stdout. Arguments are used to define the criteria and exact behavior.
+
+=head1 OVERVIEW
+
+B<join_lines> understands the following optional arguments:
+
+=over 15
+
+=item --help
+
+Show this documentation.
+
+=item --delimiter=<d>
+
+B<join_lines> will use the specified delimiter <d> to separate joined lines. The
+default is to use no delimiter when joining lines.
+
+=item --lines=<n>
+
+This will cause B<join_lines> to join every <n> lines into one line.
+
+=item --record-separator=<s>
+
+This will cause B<join_lines> to immediately start a new line when it encounters
+a line solely containing <s>. That line itself will not otherwise be used. This
+argument may be used in conjunction with --lines, but probably should not be. :)
+
+=item --record-separator-empty-line
+
+This works like --record-separator, but is a way to define an "empty" line as
+being the record separator.
+
+=back
+
+=cut
+###############################################################################
+
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+
+my $help;
+my $delimiter; # string placed between joined lines (default: none)
+my $linecount; # join every N lines
+my $record_separator; # a line containing exactly this ends a record
+my $record_separator_empty_line; # an empty line ends a record
+
+GetOptions(
+ 'delimiter=s' => \$delimiter,
+ 'lines=s' => \$linecount,
+ 'record-separator=s' => \$record_separator,
+ 'record-separator-empty-line' => \$record_separator_empty_line,
+ 'help|?' => \$help
+);
+pod2usage(-verbose => 2) if $help;
+
+# Shared state between the main loop and join_lines().
+my $count = 0;
+my @lines = ();
+
+# join_lines()
+# Emit the buffered lines as one joined record and reset the buffer and
+# line counter. Uses a defined-ness check (rather than ||) so that a
+# --delimiter of "0", which is false but defined, is honored.
+sub join_lines {
+ print join(defined $delimiter ? $delimiter : '', @lines) . "\n";
+ @lines = ();
+ $count = 0;
+}
+
+# Main loop: strip CR/LF, then either flush on a record-separator match,
+# flush after --lines lines, pass through unchanged when no options were
++# given, or keep accumulating.
+while (my $line = <>) {
+ $count++;
+ $line =~ s/[\r\n]+$//g;
+ if (defined $record_separator_empty_line && $line eq '') {
+ join_lines();
+ } elsif (defined $record_separator && $line eq $record_separator) {
+ join_lines();
+ } elsif (defined $linecount && $count == $linecount) {
+ push @lines, $line;
+ join_lines();
+ } elsif (! defined $linecount && ! defined $record_separator && ! defined $record_separator_empty_line) {
+ print "$line\n"; # passthru when given no arguments
+ } else {
+ push @lines, $line;
+ }
+}
+# Flush a trailing partial record. Defined-ness check (not ||) so that a
+# --delimiter of "0" is honored.
+if (scalar(@lines) > 0) {
+ print join(defined $delimiter ? $delimiter : '', @lines) . "\n";
+}
--- /dev/null
+#!/usr/bin/perl -w
+# so we're wanting our lines to end in <13><10>, ideally, and this is catching lines where isolated <10>'s cause a premature linebreak
+# A lone LF not preceded by CR at end-of-line is removed, gluing the line
+# to the next one; proper CRLF-terminated lines pass through unchanged.
+while (my $line = <>) {
+ $line =~ s/([^\x0D])\x0A$/$1/;
+ print $line;
+}
--- /dev/null
+#!/usr/bin/perl -w
+###############################################################################
+=pod
+
+=head1 NAME
+
+join_lines_if_short - program for joining some adjacent lines from stdin or one
+or more files
+
+=head1 SYNOPSIS
+
+B<join_lines_if_short> [options...] [text files...]
+
+=head1 DESCRIPTION
+
+B<join_lines_if_short> is used to join adjacent lines from stdin or one or more
+files, trimming trailing line feeds and carriage returns, and sending the output
+to stdout. Lines are only joined if the first line falls short of the
+expected number of columns.
+
+=head1 OVERVIEW
+
+B<join_lines_if_short> understands the following optional arguments:
+
+=over 15
+
+=item --help
+
+Show this documentation.
+
+=item --delimiter=<d>
+
+B<join_lines_if_short> will use the specified delimiter <d> for determining the
+column count for each line. The default is to assume tab as the delimiter.
+
+=item --join_delimiter=<d>
+
+B<join_lines_if_short> will use the specified delimiter <d> when joining lines.
+The default is to use no delimiter.
+
+=item --columns=<n>
+
+B<join_lines_if_short> will expect each line to contain <n> columns. If a line
+has fewer than <n> columns, then this is the trigger for joining that line with
+the next line. The new line will be reconsidered and potentially joined with
+the next line and so on.
+
+=back
+
+=cut
+###############################################################################
+
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+
+my $help;
+my $delimiter = "\t"; # field separator used to count columns
+my $join_delimiter = ""; # string placed between joined lines
+my $colcount; # required; expected number of columns per record
+
+GetOptions(
+ 'delimiter=s' => \$delimiter,
+ 'join_delimiter=s' => \$join_delimiter,
+ 'columns=s' => \$colcount,
+ 'help|?' => \$help
+);
+# --columns is mandatory; show the POD otherwise.
+pod2usage(-verbose => 2) if $help || ! defined $colcount;
+
+# Accumulate physical lines until the buffer contains at least --columns
+# fields, then emit it as one record.
+# NOTE(review): $delimiter is interpolated into the regex unescaped; a
+# metacharacter delimiter (e.g. '|') is treated as a pattern.
+my $line_buffer = '';
+while (my $line = <>) {
+ chomp $line;
+ if ($line_buffer eq '') {
+ $line_buffer = $line;
+ } else {
+ $line_buffer = "$line_buffer$join_delimiter$line";
+ }
+ # LIMIT of -1 keeps trailing empty fields: perl's split drops them by
+ # default, which undercounted columns for rows ending in empty cells
+ # and caused such rows to wrongly swallow the following line.
+ my @f = split /$delimiter/, $line_buffer, -1;
+ if (scalar(@f) >= $colcount) {
+ $line_buffer =~ s/\x0D//g; # strip embedded carriage returns
+ print "$line_buffer\n";
+ $line_buffer = '';
+ }
+}
+# Flush whatever is left, even if still short.
+if ($line_buffer ne '') {
+ print "$line_buffer\n";
+}
--- /dev/null
+#!/bin/bash
+START=$2
+if [ ! $START ]; then
+ START=1;
+fi
+tail -n +$START $1 | head -1 > $1.headers
+echo "wrote $1.headers (line $START)"
+tail -n +`expr $START + 1` $1 > $1.no_headers
+echo "wrote $1.no_headers (after line $START)"
+if [ "$START" -gt "1" ]; then
+ tail -n +$START $1 > $1.with_headers
+ echo "wrote $1.with_headers (starting with line $START)"
+fi
--- /dev/null
+#!/usr/bin/perl -w
+# Replace embedded CR+TAB pairs with an underscore so stray carriage
+# returns inside fields cannot masquerade as column boundaries.
+while (my $line = <>) {
+ chomp $line;
+ $line =~ s/\x0D\x09/_/g;
+ print "$line\n";
+}