2 ###############################################################################
9 Interactive session for analyzing, flagging, and mapping legacy field data to
12 Upon exit, generate either [file].clean.map.sql or <parent table>_map.sql. The
13 SQL generated will be UPDATE's for setting the Evergreen-specific columns for a
14 given file's staging tables, and TRUNCATE's and INSERT's for auxilary tables.
15 The files will have \include hooks for pulling in additional mapping files
16 (for example, end-user mappings for circ modifiers, etc.)
24 ###############################################################################
30 HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
31 MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
37 my $mig_bin = "$FindBin::Bin/";
38 use lib "$FindBin::Bin/";
41 pod2usage(-verbose => 2) if ! $ARGV[0] || $ARGV[0] eq '--help';
43 Mig::die_if_no_env_migschema();
44 Mig::die_if_mig_tracking_table_does_not_exist();
46 my $column_filter = 1; # show all fields
47 my $file = abs_path($ARGV[0]);
49 my $tracked_file_id = Mig::check_for_tracked_file($file);
50 if ($tracked_file_id) {
51 $fdata = Mig::status_this_file($file);
53 die "File not currently tracked: $file\n";
56 my $table = $fdata->{staged_table};
58 die "No staged staged table for file: $file\n";
62 my $term = Term::ReadLine->new('mapper');
64 my $OUT = $term->OUT || \*STDOUT;
68 $prompt = "$fdata->{staged_table}: ";
69 while ( $loop && defined (my $cmd = $term->readline($prompt)) ) {
73 $term->addhistory($cmd) if $cmd =~ /\S/;
74 if ($cmd =~ /^\d+$/) {
75 my $ret = column_menu($cmd);
82 case /^(ls|\?|\.|;)$/ {
105 $loop = 0 if $cmd =~ /^q/io;
110 ###############################################################################
113 print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
115 print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
117 print " l) SELECT * FROM $fdata->{staged_table} LIMIT 10;\n";
118 print "f1) show all fields (default)\n";
119 print "f2) show legacy fields\n";
120 print "f3) show EG fields\n";
121 print " q) quit\n\n";
122 printf "%-36s", "Columns (* for required)";
123 printf "%-30s", "Target";
124 printf "%-30s", "Transform";
125 printf "%-30s", "First Row";
126 printf "%-30s", "Migration Note";
128 printf "%-36s", "-------";
129 printf "%-30s", "------";
130 printf "%-30s", "---------";
131 printf "%-30s", "---------";
132 printf "%-30s", "--------------";
134 my $dbh = Mig::db_connect();
135 my $sth = $dbh->prepare("
137 FROM information_schema.columns
138 WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . "
139 AND table_name = " . $dbh->quote($table) . "
140 ORDER BY dtd_identifier::INTEGER ASC;
142 my $rv = $sth->execute()
143 || die "Error retrieving data from information_schema: $!";
144 my $sth2 = $dbh->prepare("
146 FROM $MIGSCHEMA.$table
149 my $rv2 = $sth2->execute()
150 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
151 my $row = $sth2->fetchrow_hashref;
153 open LESS, "|less -F";
154 @dtd_identifiers = ();
155 while (my $data = $sth->fetchrow_hashref) {
156 my $column = $data->{column_name};
157 if ($column_filter == 2 && !($column =~ /^[xl]_/)) {
160 if ($column_filter == 3 && ($column =~ /^[xl]_/)) {
163 my $cdata = status_this_column($column);
164 printf LESS $cdata->{required} ? '*' : ' ';
165 printf LESS "%3s) ", $data->{dtd_identifier};
166 push @dtd_identifiers, $data->{dtd_identifier};
167 printf LESS "%-30s", $column;
168 printf LESS "%-30s", defined $cdata->{target_table}
169 ? ( $cdata->{target_table} ne $table ? $cdata->{target_table} . '.' : '') . $cdata->{target_column}
171 printf LESS "%-30s", defined $cdata->{transform} ? $cdata->{transform} : '';
172 printf LESS "%-30s", defined $$row{$column} ? $$row{$column} : '';
173 printf LESS "%-30s", defined $cdata->{comment} ? $cdata->{comment} : '';
180 Mig::db_disconnect($dbh);
184 my $dtd_identifier = shift;
185 my $dbh = Mig::db_connect();
186 my $sth = $dbh->prepare("
188 FROM information_schema.columns
189 WHERE table_schema = " . $dbh->quote($MIGSCHEMA) . "
190 AND table_name = " . $dbh->quote($table) . "
191 AND dtd_identifier = " . $dbh->quote($dtd_identifier) . ";
193 my $rv = $sth->execute()
194 || die "Error retrieving data from information_schema: $!";
195 my $data = $sth->fetchrow_hashref;
197 Mig::db_disconnect($dbh);
199 my $column = $data->{column_name};
201 my $prompt = "$table.$column: ";
205 my $cdata = status_this_column($column);
206 print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
208 print "\n-------------------------------------------------------------------------------------------------\n";
209 print " target: " . (
210 defined $cdata->{target_table}
211 ? ( $cdata->{target_table} ne $table ? $cdata->{target_table} . '.' : '') . $cdata->{target_column}
214 print "transform: " . (defined $cdata->{transform} ? $cdata->{transform} : '') . "\n";
215 print " comment: " . (defined $cdata->{comment} ? $cdata->{comment} : '') . "\n";
216 print "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
218 print " l) SELECT $column FROM $fdata->{staged_table} LIMIT 10;\n";
219 print " s) summarize\n" if $column ne 'x_migrate';
220 print " g) group browse\n";
221 print "g2) group browse (order by count desc)\n";
222 print " c) comment\n";
223 print " f) flag for end-user mapping\n";
224 print " t) target\n";
225 print " e) eval/transform\n";
226 print " n) next column\n";
227 print " p) prev column\n";
228 print " q) quit back to table menu\n";
234 while ( $loop && defined (my $cmd = $term->readline($prompt)) ) {
237 $term->addhistory($cmd) if $cmd =~ /\S/;
238 $loop = 0 if $cmd =~ /^q/io;
240 case /^(ls|\?|\.|;)$/ {
253 group_browse($column);
256 group_browse($column,'GROUP BY 1 ORDER BY 2 DESC');
259 if ($cmd =~ /^c\s+(.+)$/) {
260 set_comment($column,$1);
264 if ($cmd =~ /^t\s+(.+)$/) {
265 set_target($column,$1);
269 if ($cmd =~ /^e\s+(.+)$/) {
270 set_transform($column,$1);
274 my( $index )= grep { $dtd_identifiers[$_] eq $dtd_identifier } 0..$#dtd_identifiers;
275 return $dtd_identifiers[$index + 1];
278 my( $index )= grep { $dtd_identifiers[$_] eq $dtd_identifier } 0..$#dtd_identifiers;
279 return $dtd_identifiers[$index - 1];
288 my $dbh = Mig::db_connect();
293 $sth = $dbh->prepare(Mig::sql("
294 SELECT " . (defined $column ? $column : '*') . "
295 FROM $MIGSCHEMA.$table
298 $rv = $sth->execute()
299 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
301 while (@cols = $sth->fetchrow_array) {
302 print "\t" . join(',',map {defined $_ ? $_ : ''} @cols) . "\n";
313 my $distinct_value_count;
314 my $distinct_integer_value_count;
315 my $distinct_money6_value_count;
316 my $distinct_money8_value_count;
317 my $distinct_date_value_count;
318 my $distinct_timestamptz_value_count;
322 my $min_length_min_value;
325 my $max_length_max_value;
327 my $min_value_as_integer;
328 my $max_value_as_integer;
330 my $min_value_as_money6;
331 my $max_value_as_money6;
333 my $min_value_as_money8;
334 my $max_value_as_money8;
336 my $min_value_as_date;
337 my $max_value_as_date;
339 my $min_value_as_timestamptz;
340 my $max_value_as_timestamptz;
342 my $dbh = Mig::db_connect();
348 $sth = $dbh->prepare("
350 FROM $MIGSCHEMA.$table;
352 $rv = $sth->execute()
353 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
354 @cols = $sth->fetchrow_array;
359 $sth = $dbh->prepare("
361 FROM $MIGSCHEMA.$table
362 WHERE $column IS NOT NULL AND BTRIM($column) <> '';
364 $rv = $sth->execute()
365 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
366 @cols = $sth->fetchrow_array;
368 $non_empty_count = $cols[0];
370 ### distinct_value_count
371 $sth = $dbh->prepare("
372 SELECT COUNT(DISTINCT $column)
373 FROM $MIGSCHEMA.$table;
375 $rv = $sth->execute()
376 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
377 @cols = $sth->fetchrow_array;
379 $distinct_value_count = $cols[0];
381 ### distinct_integer_value_count
382 $sth = $dbh->prepare("
383 SELECT COUNT(DISTINCT migration_tools.attempt_cast($column,'INTEGER')::INTEGER)
384 FROM $MIGSCHEMA.$table;
386 $rv = $sth->execute()
387 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
388 @cols = $sth->fetchrow_array;
390 $distinct_integer_value_count = $cols[0];
392 ### distinct_money6_value_count
393 $sth = $dbh->prepare("
394 SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
395 FROM $MIGSCHEMA.$table;
397 $rv = $sth->execute()
398 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
399 @cols = $sth->fetchrow_array;
401 $distinct_money6_value_count = $cols[0];
403 ### distinct_money8_value_count
404 $sth = $dbh->prepare("
405 SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
406 FROM $MIGSCHEMA.$table;
408 $rv = $sth->execute()
409 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
410 @cols = $sth->fetchrow_array;
412 $distinct_money8_value_count = $cols[0];
414 ### distinct_date_value_count
415 $sth = $dbh->prepare("
416 SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
417 FROM $MIGSCHEMA.$table;
419 $rv = $sth->execute()
420 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
421 @cols = $sth->fetchrow_array;
423 $distinct_date_value_count = $cols[0];
425 ### distinct_timestamptz_value_count
426 $sth = $dbh->prepare("
427 SELECT COUNT(DISTINCT NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
428 FROM $MIGSCHEMA.$table;
430 $rv = $sth->execute()
431 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
432 @cols = $sth->fetchrow_array;
434 $distinct_timestamptz_value_count = $cols[0];
437 $sth = $dbh->prepare("
439 FROM $MIGSCHEMA.$table;
441 $rv = $sth->execute()
442 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
443 @cols = $sth->fetchrow_array;
445 $min_value = $cols[0];
448 $sth = $dbh->prepare("
449 SELECT MIN(LENGTH($column))
450 FROM $MIGSCHEMA.$table;
452 $rv = $sth->execute()
453 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
454 @cols = $sth->fetchrow_array;
456 $min_length = $cols[0];
458 ### min_length_min_value
459 $sth = $dbh->prepare("
461 FROM $MIGSCHEMA.$table
462 WHERE LENGTH($column) = $min_length;
464 $rv = $sth->execute()
465 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
466 @cols = $sth->fetchrow_array;
468 $min_length_min_value = $cols[0];
470 ### min_value_as_integer
471 $sth = $dbh->prepare("
472 SELECT MIN(migration_tools.attempt_cast($column,'INTEGER')::INTEGER)
473 FROM $MIGSCHEMA.$table;
475 $rv = $sth->execute()
476 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
477 @cols = $sth->fetchrow_array;
479 $min_value_as_integer = $cols[0];
481 ### min_value_as_money6
482 $sth = $dbh->prepare("
483 SELECT MIN(NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
484 FROM $MIGSCHEMA.$table;
486 $rv = $sth->execute()
487 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
488 @cols = $sth->fetchrow_array;
490 $min_value_as_money6 = $cols[0];
492 ### min_value_as_money8
493 $sth = $dbh->prepare("
494 SELECT MIN(NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
495 FROM $MIGSCHEMA.$table;
497 $rv = $sth->execute()
498 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
499 @cols = $sth->fetchrow_array;
501 $min_value_as_money8 = $cols[0];
503 ### min_value_as_date
504 $sth = $dbh->prepare("
505 SELECT MIN(NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
506 FROM $MIGSCHEMA.$table;
508 $rv = $sth->execute()
509 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
510 @cols = $sth->fetchrow_array;
512 $min_value_as_date = $cols[0];
514 ### min_value_as_timestamptz
515 $sth = $dbh->prepare("
516 SELECT MIN(NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
517 FROM $MIGSCHEMA.$table;
519 $rv = $sth->execute()
520 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
521 @cols = $sth->fetchrow_array;
523 $min_value_as_timestamptz = $cols[0];
526 $sth = $dbh->prepare("
528 FROM $MIGSCHEMA.$table;
530 $rv = $sth->execute()
531 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
532 @cols = $sth->fetchrow_array;
534 $max_value = $cols[0];
537 $sth = $dbh->prepare("
538 SELECT MAX(LENGTH($column))
539 FROM $MIGSCHEMA.$table;
541 $rv = $sth->execute()
542 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
543 @cols = $sth->fetchrow_array;
545 $max_length = $cols[0];
547 ### max_length_max_value
548 $sth = $dbh->prepare("
550 FROM $MIGSCHEMA.$table
551 WHERE LENGTH($column) = $max_length;
553 $rv = $sth->execute()
554 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
555 @cols = $sth->fetchrow_array;
557 $max_length_max_value = $cols[0];
559 ### max_value_as_integer
560 $sth = $dbh->prepare("
561 SELECT MAX(migration_tools.attempt_cast($column,'INTEGER')::INTEGER)
562 FROM $MIGSCHEMA.$table;
564 $rv = $sth->execute()
565 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
566 @cols = $sth->fetchrow_array;
568 $max_value_as_integer = $cols[0];
570 ### max_value_as_money6
571 $sth = $dbh->prepare("
572 SELECT MAX(NULLIF(migration_tools.attempt_money6($column,'-0.01'),-0.01))
573 FROM $MIGSCHEMA.$table;
575 $rv = $sth->execute()
576 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
577 @cols = $sth->fetchrow_array;
579 $max_value_as_money6 = $cols[0];
581 ### max_value_as_money8
582 $sth = $dbh->prepare("
583 SELECT MAX(NULLIF(migration_tools.attempt_money($column,'-0.01'),-0.01))
584 FROM $MIGSCHEMA.$table;
586 $rv = $sth->execute()
587 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
588 @cols = $sth->fetchrow_array;
590 $max_value_as_money8 = $cols[0];
592 ### max_value_as_date
593 $sth = $dbh->prepare("
594 SELECT MAX(NULLIF(migration_tools.attempt_date($column,'1969-06-09'),'1969-06-09'::DATE))
595 FROM $MIGSCHEMA.$table;
597 $rv = $sth->execute()
598 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
599 @cols = $sth->fetchrow_array;
601 $max_value_as_date = $cols[0];
603 ### max_value_as_timestamptz
604 $sth = $dbh->prepare("
605 SELECT MAX(NULLIF(migration_tools.attempt_timestamptz($column,'1969-06-09'),'1969-06-09'::TIMESTAMPTZ))
606 FROM $MIGSCHEMA.$table;
608 $rv = $sth->execute()
609 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
610 @cols = $sth->fetchrow_array;
612 $max_value_as_timestamptz = $cols[0];
614 Mig::db_disconnect($dbh);
617 print "# of rows = $count\n";
618 print "# of non-empty rows = $non_empty_count\n";
619 print "# of distinct values (as text) = $distinct_value_count\n";
620 print "# of distinct values (as integer) = $distinct_integer_value_count\n";
621 print "# of distinct values (as money6) = $distinct_money6_value_count\n";
622 print "# of distinct values (as money8) = $distinct_money8_value_count\n";
623 print "# of distinct values (as date) = $distinct_date_value_count\n";
624 print "# of distinct values (as timestamptz) = $distinct_timestamptz_value_count\n";
626 print "minimum value (as text) = $min_value\n";
627 print "maximum value (as text) = $max_value\n";
629 print "minimum value length (as text) = $min_length (min value: $min_length_min_value)\n";
630 print "maximum value length (as text) = $max_length (max value: $max_length_max_value)\n";
632 print "minimum value (as integer) = " . ($min_value_as_integer ? $min_value_as_integer : '') . "\n";
633 print "maximum value (as integer) = " . ($max_value_as_integer ? $max_value_as_integer : '') . "\n";
635 print "minimum value (as money6) = " . ($min_value_as_money6 ? $min_value_as_money6 : '') . "\n";
636 print "maximum value (as money6) = " . ($max_value_as_money6 ? $max_value_as_money6 : '') . "\n";
638 print "minimum value (as money8) = " . ($min_value_as_money8 ? $min_value_as_money8 : '') . "\n";
639 print "maximum value (as money8) = " . ($max_value_as_money8 ? $max_value_as_money8 : '') . "\n";
641 print "minimum value (as date) = " . ($min_value_as_date ? $min_value_as_date : '') . "\n";
642 print "maximum value (as date) = " . ($max_value_as_date ? $max_value_as_date : '') . "\n";
644 print "minimum value (as timestamptz) = " . ($min_value_as_timestamptz ? $min_value_as_timestamptz : '') . "\n";
645 print "maximum value (as timestamptz) = " . ($max_value_as_timestamptz ? $max_value_as_timestamptz : '') . "\n";
650 my ($column,$option) = (shift,shift||"GROUP BY 1 ORDER BY 1");
652 my $dbh = Mig::db_connect();
656 $sth = $dbh->prepare(Mig::sql("
657 SELECT $column, COUNT(*)
658 FROM $MIGSCHEMA.$table
661 $rv = $sth->execute()
662 || die "Error retrieving data from $MIGSCHEMA.$table: $!";
665 open LESS, "|less -F";
666 printf LESS "%-30s", "Value:";
667 print LESS "Count:\n\n";
668 while (my @cols = $sth->fetchrow_array) {
669 my $value = $cols[0];
670 my $count = $cols[1];
671 printf LESS "%-30s", defined $value ? $value : '<NULL>';
672 print LESS "$count\n";
679 ###############################################################################
681 sub add_this_column {
683 if (!Mig::check_for_tracked_column($table,$column)) {
684 my $dbh = Mig::db_connect();
686 INSERT INTO $MIGSCHEMA.tracked_column (
692 " . $dbh->quote($file) . "
693 ," . $dbh->quote($fdata->{parent_table}) . "
694 ," . $dbh->quote($table) . "
695 ," . $dbh->quote($column) . "
697 ") || die "Error inserting into table $MIGSCHEMA.tracked_column: $!\n";
698 Mig::db_disconnect($dbh);
702 sub status_this_column {
704 my $data = Mig::status_this_column($table,$column);
706 add_this_column($column);
707 $data = Mig::status_this_column($table,$column);
709 if ($$data{parent_table}) {
710 my $dbh = Mig::db_connect();
711 my $sth = $dbh->prepare("
713 FROM $MIGSCHEMA.fields_requiring_mapping
714 WHERE table_name = " . $dbh->quote( $$data{parent_table} ) . "
715 AND column_name = " . $dbh->quote( $column ) . ";"
717 my $rv = $sth->execute()
718 || die "Error checking table (tracked_column) for $table.$column: $!";
719 my $data2 = $sth->fetchrow_hashref;
721 $$data{required} = 1;
723 $$data{required} = 0;
726 Mig::db_disconnect($dbh);
732 my ($column,$comment) = (shift,shift);
734 my $data = status_this_column($column);
735 my $dbh = Mig::db_connect();
737 UPDATE $MIGSCHEMA.tracked_column
738 SET comment = " . $dbh->quote($comment) . "
739 WHERE id = " . $dbh->quote($data->{id}) . ";
740 ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n";
741 Mig::db_disconnect($dbh);
746 my ($column,$transform) = (shift,shift);
748 my $data = status_this_column($column);
749 my $dbh = Mig::db_connect();
751 UPDATE $MIGSCHEMA.tracked_column
752 SET transform = " . $dbh->quote($transform) . "
753 WHERE id = " . $dbh->quote($data->{id}) . ";
754 ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n";
755 Mig::db_disconnect($dbh);
760 my ($column,$target) = (shift,shift);
761 my $target_table = $table;
762 my $target_column = $target;
764 if ($target =~ /^(.+)\.(.+)$/) {
768 my $data = status_this_column($column);
769 my $dbh = Mig::db_connect();
771 UPDATE $MIGSCHEMA.tracked_column
772 SET target_table = " . $dbh->quote($target_table) . "
773 ,target_column = " . $dbh->quote($target_column) . "
774 WHERE id = " . $dbh->quote($data->{id}) . ";
775 ") || die "Error updating table $MIGSCHEMA.tracked_column: $!\n";
776 Mig::db_disconnect($dbh);