"mig" tool

[migration-tools.git] / text / clean_csv
diff --git a/text/clean_csv b/text/clean_csv

new file mode 100755 (executable)

index 0000000..599a5d0
--- /dev/null
+++ b/text/clean_csv
@@ -0,0 +1,792 @@
+#!/usr/bin/perl -w
+use Storable;
+use Switch;
+use Getopt::Long;
+use Text::CSV_XS;
+use Text::CSV::Separator qw(get_separator);
+use Data::Dumper;
+use Term::ANSIColor;
+
+# may be manipulated with --config
+our %CSV_options = (
+    binary => 1,
+    auto_diag => 1,
+    diag_verbose => 1,
+);
+my $csv;
+our $fixes = {
+       'R' => [],
+       'I' => [],
+       'D' => []
+};
+my @parsed_rows;
+my @lines_with_errors = ();
+my %line_numbers_for_lines_with_errors = ();
+my $expected_column_count;
+my $line_no;
+
+# GetOpt variables
+my $config;
+my $id_cols;
+my $fix;
+my $nosave;
+my $apply;
+my $pad;
+my $truncate;
+my $backslash;
+my $debug;
+my $help;
+my $create_headers;
+my $headers_file;
+my @headers = ();
+
+my $pad_count = 0; my $trunc_count = 0; my $fix_count = 0; my $backslash_count = 0;
+
+################################################################## Subs
+
+sub format_for_display {
+       my $formatted_line = shift;
+       my $sep_char = $CSV_options{sep_char} || '\t';
+       my $sep = color 'bold blue';
+       $sep .= '<' . (ord($sep_char) < 32 ? ord($sep_char) : $sep_char) . '>';
+       $sep .= color 'reset';
+       my $quote_char = $CSV_options{quote_char} || '';
+       my $quote = color 'bold red';
+       $quote .= '<' . (ord($quote_char) < 32 ? ord($quote_char) : $quote_char) . '>';
+       $quote .= color 'reset';
+       my $escape_char = $CSV_options{escape_char} || '';
+       my $escape = color 'bold green';
+       $escape .= '<' . (ord($escape_char) < 32 ? ord($escape_char) : $escape_char) . '>';
+       $escape .= color 'reset';
+       my $real_escape_char = chr(27);
+       my $real_escape = color 'bold green';
+       $real_escape .= '<27>';
+       $real_escape .= color 'reset';
+
+       $formatted_line =~ s/$real_escape_char/$real_escape/g;
+       $formatted_line =~ s/$sep_char/$sep/g;
+       $formatted_line =~ s/$quote_char/$quote/g;
+       $formatted_line =~ s/$escape_char/$escape/g;
+       for (my $i = 0; $i < 32; $i++) {
+               if ($i == 27) { next; }
+               my $other_char = chr($i);
+               my $other = color 'yellow';
+               $other .= "<$i>";
+               $other .= color 'reset';
+               $formatted_line =~ s/$other_char/$other/g;
+       }
+       return "$formatted_line\n";
+}
+
+sub combine_cols {
+       my $row = shift;
+    my $status = $csv->combine(@{ $row });
+    if ($status && $csv->string) {
+        return $csv->string . "\n";
+    } else {
+        die $csv->error_input . "\n";
+    }
+}
+
+sub convert_backslashes {
+       my $line = shift;
+       my $altered_line;
+       my @count = $line =~ /\\/g;
+       if (scalar(@count) > 0) {
+               my $csv2 = Text::CSV_XS->new(\%CSV_options);
+               if ($csv2->parse($line)) {
+                       my @columns = $csv2->fields();
+                       foreach my $c (@columns) {
+                               if ($c ne '\N') {
+                                       $c =~ s/\\/\//g;
+                               }
+                       }
+                       $altered_line = combine_cols(\@columns);
+               } else {
+                       $altered_line =~ s/\\/\//g;
+               }
+               if ($line ne $altered_line) {
+                       $backslash_count += scalar(@count);
+                       print "\nline#$line_no>> Converting " . scalar(@count) . " backslashes to forward slashes\n";
+                       print "before: " . format_for_display($line);
+                       $line = $altered_line;
+                       print " after: " . format_for_display($line);
+               }
+       }
+       return $line;
+}
+
+sub apply_line_fixes {
+       my $line = shift;
+       foreach my $fix ( @{$fixes->{'R'}} ) {
+               my $id_regex = $fix->[0];
+               if ($line =~ /$id_regex/) {
+                       print "\nline#$line_no>> Applying regex fix for $id_regex\n";
+                       $fix_count++;
+                       my $regex1 = $fix->[1];
+                       my $regex2 = $fix->[2];
+                       my $global = $fix->[3];
+                       my $ignore_case = $fix->[4];
+                       print "before: " . format_for_display($line);
+                       $line = fix_via_regex($id_regex,$line,$regex1,$regex2,$global,$ignore_case);
+                       print " after: " . format_for_display($line);
+               }
+       }
+       return $line;
+}
+
+sub apply_insert_fixes {
+       my $line = shift;
+       my $cols = shift;
+       foreach my $fix ( @{$fixes->{'I'}} ) {
+               my $id_regex = $fix->[0];
+               my $col_count_check = $fix->[1];
+               if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+                       print "\nline#$line_no>> Applying insert for $id_regex\n";
+                       $fix_count++;
+                       my $fix_cols = $fix->[2];
+                       my $fix_value = $fix->[3];
+                       print "before: " . format_for_display($line);
+                       $line = fix_via_insert($id_regex,$cols,$col_count_check,$fix_cols,$fix_value);
+                       print " after: " . format_for_display($line);
+               }
+       }
+       return $line;
+}
+
+sub apply_delete_fixes {
+       my $line = shift;
+       my $cols = shift;
+       foreach my $fix ( @{$fixes->{'D'}} ) {
+               my $id_regex = $fix->[0];
+               my $col_count_check = $fix->[1];
+               if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+                       print "\nline#$line_no>> Applying delete for $id_regex\n";
+                       $fix_count++;
+                       my $fix_cols = $fix->[2];
+                       print "before: " . format_for_display($line);
+                       $line = fix_via_delete($id_regex,$cols,$col_count_check,$fix_cols);
+                       print " after: " . format_for_display($line);
+               }
+       }
+       return $line;
+}
+
+sub apply_join_fixes {
+       my $line = shift;
+       my $cols = shift;
+       foreach my $fix ( @{$fixes->{'J'}} ) {
+               my $id_regex = $fix->[0];
+               my $col_count_check = $fix->[1];
+               if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+                       print "\nline#$line_no>> Applying join for $id_regex\n";
+                       $fix_count++;
+                       my $fix_cols = $fix->[2];
+                       print "before: " . format_for_display($line);
+                       $line = fix_via_join($id_regex,$cols,$col_count_check,$fix_cols);
+                       print " after: " . format_for_display($line);
+               }
+       }
+       return $line;
+}
+
+sub save_fix {
+       print "saving fix...";
+       my $type = shift;
+       my $fix = shift;
+       if ($nosave) { print "psyche!\n"; return; }
+       print "fix = " . Dumper($fix) . "\n" if $debug;
+       push @{$fixes->{$type}}, $fix;
+       store $fixes, $ARGV[0] . '.fixes';
+       print "saved\n";
+}
+
+sub id_cols_regex {
+       my $cols = shift;
+       my @f = sort(split /,/, $id_cols || '0');
+       my $regex = '';
+       for (my $i = 0; $i < scalar(@f); $i++) {
+               if ($i > 0) {
+                       $regex .= '.+'; # characters between id columns
+               }
+               $regex .= '.?' . $cols->[$f[$i]] . '.?';
+       }
+       $regex .= '';
+       return $regex;
+}
+
+sub fix_via_regex {
+       my $id_regex = shift;
+       my $line = shift;
+       my $fix_regex1 = shift;
+       my $fix_regex2 = shift;
+       my $global = shift;
+       my $ignore_case = shift;
+       my $save;
+
+       if (!$fix_regex1) {
+global_prompt:
+               print "Global (aka s/match/replace/g)? <Yes/No> [n] ";
+               $global = readline(STDIN); chomp $global;
+               $global = uc(substr($global,0,1));
+               if ($global eq '') {
+                       $global = 'N';
+               }
+               $global = uc(substr($global,0,1));
+               if ($global ne 'Y' && $global ne 'N') {
+                       goto global_prompt;
+               }
+case_prompt:
+               print "Ignore-case (aka s/match/replace/i)? <Yes/No> [n] ";
+               $ignore_case = readline(STDIN); chomp $ignore_case;
+               $ignore_case = uc(substr($ignore_case,0,1));
+               if ($ignore_case eq '') {
+                       $ignore_case = 'N';
+               }
+               $ignore_case = uc(substr($ignore_case,0,1));
+               if ($ignore_case ne 'Y' && $ignore_case ne 'N') {
+                       goto case_prompt;
+               }
+
+regex1_prompt:
+               print "Enter match regex for s/match/replace/: ";
+               $fix_regex1 = readline(STDIN); chomp $fix_regex1;
+               if ($fix_regex1 eq '') {
+                       goto global_prompt;
+               }
+               if (
+                       ($global eq 'Y' && $ignore_case eq 'Y' && $line =~ /$fix_regex1/gi) 
+                       || ($global eq 'Y' && $ignore_case eq 'N' && $line =~ /$fix_regex1/g) 
+                       || ($global eq 'N' && $ignore_case eq 'N' && $line =~ /$fix_regex1/i) 
+               ) {
+                       print "Regex matches line.\n";
+               } else {
+                       print "Regex does not match line.\n";
+                       goto regex1_prompt;
+               }
+regex2_prompt:
+               print "Enter replace regex for s/match/replace/: ";
+               $fix_regex2 = readline(STDIN); chomp $fix_regex2;
+               if (substr($fix_regex1,-1) eq '$') {
+                       print "Adding new line to end of /$fix_regex2/ based on \$ in /$fix_regex1/\n";
+                       $fix_regex2 .= "\n";
+               }
+               # TODO - how to do we handle backreferences with this?
+               $save = 1;
+       }
+
+       switch ($global . $ignore_case) {
+               case 'YY'       { $line =~ s/$fix_regex1/$fix_regex2/gi; }
+               case 'YN'       { $line =~ s/$fix_regex1/$fix_regex2/g; }
+               case 'NY'       { $line =~ s/$fix_regex1/$fix_regex2/i; }
+               case 'NN'       { $line =~ s/$fix_regex1/$fix_regex2/; }
+       }
+
+       if ($save) {
+               save_fix('R',[
+                       $id_regex,
+                       $fix_regex1,
+                       $fix_regex2,
+                       $global,
+                       $ignore_case
+               ]);
+       }
+
+       return $line;
+}
+
+sub fix_via_insert {
+       my $id_regex = shift;
+       my $cols = shift;
+       my $col_count_check = shift;
+       my $fix_cols = shift;
+       my $fix_value = shift;
+       my $line;
+       my $save;
+
+       if (!$fix_cols) {
+               $col_count_check = scalar(@{$cols});
+               print "This fix will only trigger when the number of columns is $col_count_check.\n";
+               print "Enter value to insert: [] ";
+               $fix_value = readline(STDIN); chomp $fix_value;
+               print "Enter comma-separated list of column positions (0-based) for insertion: ";
+               $fix_cols = readline(STDIN); chomp $fix_cols;
+               $save = 1;
+       }
+
+       if ($col_count_check != scalar(@{$cols})) {
+               print "WARNING: Insert column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ".  Skipping.\n";
+               return $line;
+       }
+       
+       my @f = sort(split /,/, $fix_cols);
+       for (my $i = 0; $i < scalar(@f); $i++) {
+               splice @{ $cols }, $f[$i] + $i, 0, $fix_value;
+       }
+
+       eval {
+               $line = combine_cols($cols);
+       };
+       if ($@) {
+               print "fix_via_insert error:\n";
+               die $@;
+       }
+
+       if ($save) {
+               save_fix('I',[
+                       $id_regex,
+                       $col_count_check,
+                       $fix_cols,
+                       $fix_value
+               ]);
+       }
+
+       return $line;
+}
+
+sub fix_via_delete {
+       my $id_regex = shift;
+       my $cols = shift;
+       my $col_count_check = shift;
+       my $fix_cols = shift;
+       my $line;
+       my $save;
+
+       if (!$fix_cols) {
+               $col_count_check = scalar(@{$cols});
+               print "This fix will only trigger when the number of columns is $col_count_check.\n";
+               print "Enter comma-separated list of column positions (0-based) to delete: ";
+               $fix_cols = readline(STDIN); chomp $fix_cols;
+               $save = 1;
+       }
+
+       if ($col_count_check != scalar(@{$cols})) {
+               print "WARNING: Delete column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ".  Skipping.\n";
+               return $line;
+       }
+       
+       my @f = sort(split /,/, $fix_cols);
+       for (my $i = 0; $i < scalar(@f); $i++) {
+               splice @{ $cols }, $f[$i] - $i, 1;
+       }
+
+       eval {
+               $line = combine_cols($cols);
+       };
+       if ($@) {
+               print "fix_via_delete error:\n";
+               die $@;
+       }
+
+       if ($save) {
+               save_fix('D',[
+                       $id_regex,
+                       $col_count_check,
+                       $fix_cols
+               ]);
+       }
+
+       return $line;
+}
+
+sub fix_via_join {
+       my $id_regex = shift;
+       my $cols = shift;
+       my $col_count_check = shift;
+       my $fix_cols = shift;
+       my $line;
+       my $save;
+
+       if (!$fix_cols) {
+               $col_count_check = scalar(@{$cols});
+               print "This fix will only trigger when the number of columns is $col_count_check.\n";
+               print "Enter comma-separated list of column positions (0-based) to join: ";
+               $fix_cols = readline(STDIN); chomp $fix_cols;
+               $save = 1;
+       }
+
+       if ($col_count_check != scalar(@{$cols})) {
+               print "WARNING: Join column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ".  Skipping.\n";
+               return $line;
+       }
+
+       # gather
+       my $fix_value = '';
+       my @f = sort { $a <=> $b } (split /,/, $fix_cols);
+       for (my $i = 0; $i < scalar(@f); $i++) {
+               $fix_value .= @{ $cols }[$f[$i]];
+       }
+
+       # delete        
+       for (my $i = 0; $i < scalar(@f); $i++) {
+               splice @{ $cols }, $f[$i] - $i, 1;
+       }
+
+       # insert
+       splice @{ $cols }, $f[0], 0, $fix_value;
+
+       eval {
+               $line = combine_cols($cols);
+       };
+       if ($@) {
+               print "fix_via_join error:\n";
+               die $@;
+       }
+
+       if ($save) {
+               save_fix('J',[
+                       $id_regex,
+                       $col_count_check,
+                       $fix_cols
+               ]);
+       }
+
+       return $line;
+}
+
+
+sub manual_fix {
+       my $line = shift;
+       my $cols = shift;
+       my $insert_delete_allowed = shift;
+       my $col_count = scalar(@{$cols}) > scalar(@headers) ? scalar(@{$cols}) : scalar(@headers);
+       my $max_header_length = 0;
+
+       # display columns nice and formatted
+       for (my $i = 0; $i < scalar(@headers); $i++) {
+               if (length($headers[$i]) > $max_header_length) {
+                       $max_header_length = length($headers[$i]);
+               }
+       }
+       for (my $i = 0; $i < $col_count; $i++) {
+               printf "#% d ", $i;
+               if (defined $headers[$i]) {
+                       print " " x ($max_header_length - length($headers[$i]));
+                       print $headers[$i];
+               } else {
+                       print " " x $max_header_length;
+               }
+               print ": ";
+               if (defined $cols->[$i]) {
+                       print "<" . $cols->[$i] . ">";
+               }
+               print "\n";
+       }
+
+       # prompt for type of fix
+fix_prompt:
+       print "\n" . format_for_display($line) . "\nFix line#$line_no? <Regex" . ($insert_delete_allowed ? '|Insert|Delete|Join' : '') . "|No> [n] ";
+       my $ans = readline(STDIN); chomp $ans;
+       $ans = uc(substr($ans,0,1));
+       if ($ans eq '') {
+               $ans = 'N';
+       }
+       if ($insert_delete_allowed ? index("RIDJN",$ans)==-1 : index("RN",$ans)==-1) {
+               goto fix_prompt;
+       }
+
+       # prompt for matching condition
+       my $id_regex;
+       if ($ans ne 'N') {
+               my $default_id_regex = id_cols_regex($cols);
+match_prompt:
+               print "If matching the end of the string, you may need to use \\s*\$ instead of \$\n";
+               print "Insert/Delete/Join fixes will also filter on column count.\n";
+               print "Identify this line (and optionally similar lines) with regex: [$default_id_regex] ";
+               $id_regex = readline(STDIN); chomp $id_regex;
+               if ($id_regex eq '') {
+                       $id_regex = $default_id_regex;
+               }
+               if ($line =~ /$id_regex/) {
+                       print "Regex matches line.\n";
+               } else {
+                       print "Regex does not match line.\n";
+                       goto match_prompt;
+               }
+       }
+
+       # prompt and perform actual fixes
+       switch($ans) {
+               case 'R'        { $line = fix_via_regex($id_regex,$line); }
+               case 'I'        { $line = fix_via_insert($id_regex,$cols); }
+               case 'D'        { $line = fix_via_delete($id_regex,$cols); }
+               case 'J'        { $line = fix_via_join($id_regex,$cols); }
+               case 'N'        { }
+               else            { $ans = 'N'; }
+       }
+
+       if ($ans ne 'N') {
+               print "\nNew line#$line_no: $line";
+       }
+       return ( $ans, $line );
+}
+
+################################################################## Init
+GetOptions(
+       'config=s' => \$config,
+       'idcols=s' => \$id_cols,
+       'create-headers' => \$create_headers,
+       'use-headers=s' => \$headers_file,
+       'fix' => \$fix,
+       'nosave' => \$nosave,
+       'apply' => \$apply,
+       'pad' => \$pad,
+       'truncate' => \$truncate,
+       'backslash' => \$backslash,
+       'debug' => \$debug,
+       'help|?' => \$help
+);
+if ($help || ((@ARGV == 0) && (-t STDIN))) {
+       die "\n\t$0 [--config <CONFIG>] [--idcols <idx1,idx2,...>] [--fix] [--apply] [--pad] [--truncate] <FILE>\n\n"
+               . "\tExpects <FILE> to be a CSV-like UTF-8 encoded file.\n"
+               . "\tWill produce <FILE>.clean and <FILE>.error versions of said file.\n\n"
+               . "\t--config <CONFIG> will read the Perl file <CONFIG> for settings information.  See 'Example Config' below\n\n"
+               . "\t--create-headers will generate headers like so: col1, col2, col3, etc.\n"
+               . "\t--use-headers <HFILE> will generate headers based on the specified <HFILE>, which must contain one column header per line.\n"
+               . "\t(if neither --create-headers nor --use-headers are specified, then the first line in <FILE> is assumed to contain the column headers)\n\n"
+               . "\t--fix will prompt for whether and how to fix broken records, and save those fixes in <FILE>.fixes\n"
+               . "\t--idcols <idx1,idx2,...> takes a comma-separated list of column indexes (starting with 0) to use as matchpoint suggestions for fixes\n"
+               . "\t--nosave will prevent new fixes from being saved in <FILE>.fixes\n"
+               . "\t--apply will apply previously recorded fixes from <FILE>.fixes\n\n"
+               . "\t--pad will fill in missing columns at the end if needed for otherwise unbroken records\n"
+               . "\t--truncate will strip extra columns from the end if needed for otherwise unbroken records\n"
+               . "\t--backslash will convert backslashes into forward slashes\n\n"
+               . "\t Example Config:\n\n"
+               . "\t\t\$CSV_options{quote_char} = '\"';\n"
+               . "\t\t\$CSV_options{escape_char} = '\"';\n"
+               . "\t\t\$CSV_options{sep_char} = ',';\n"
+               . "\t\t\$CSV_options{eol} = \$\\;\n"
+               . "\t\t\$CSV_options{always_quote} = 0;\n"
+               . "\t\t\$CSV_options{quote_space} = 1;\n"
+               . "\t\t\$CSV_options{quote_null} = 1;\n"
+               . "\t\t\$CSV_options{quote_binary} = 1;\n"
+               . "\t\t\$CSV_options{binary} = 0;\n"
+               . "\t\t\$CSV_options{decode_utf8} = 1;\n"
+               . "\t\t\$CSV_options{keep_meta_info} = 0;\n"
+               . "\t\t\$CSV_options{allow_loose_quotes} = 0;\n"
+               . "\t\t\$CSV_options{allow_loose_escapes} = 0;\n"
+               . "\t\t\$CSV_options{allow_unquoted_escape} = 0;\n"
+               . "\t\t\$CSV_options{allow_whitespace} = 0;\n"
+               . "\t\t\$CSV_options{blank_is_undef} = 0;\n"
+               . "\t\t\$CSV_options{empty_is_undef} = 0;\n"
+               . "\t\t\$CSV_options{verbatim} = 0;\n"
+               . "\n\n";
+}
+if (! -e $ARGV[0]) {
+       die "$ARGV[0] does not exist\n";
+}
+if ($config && ! -e $config) {
+       die "$config does not exist\n";
+}
+if ($apply && -e $ARGV[0] . '.fixes') {
+       $fixes = retrieve($ARGV[0] . '.fixes');
+}
+
+################################################################## CSV Setup
+$CSV_options{sep_char} = get_separator( path => $ARGV[0], lucky => 1 );
+if ($config && -e $config) {
+       do $config;
+}
+$csv = Text::CSV_XS->new(\%CSV_options);
+$csv->callbacks(
+    error => sub {
+        my ($err, $msg, $pos, $recno) = @_;
+        return if ($err == 2012);
+               $line_numbers_for_lines_with_errors{$line_no} = 1;
+        print "\nline#$line_no * $err : $msg -> (pos#$pos,rec#$recno)\n";
+        if ($csv->error_input) {
+            print $csv->error_input;
+            print "-" x ($pos - 1);
+            print "^\n";
+        }
+        $csv->SetDiag(0);
+        return;
+    }
+);
+
+################################################################## Reading
+
+if ($headers_file) {
+       print "_.,-~= reading $headers_file\n";
+       open my $hfile, "<:encoding(utf8)", $headers_file or die "$headers_file: $!";
+       while (my $line = <$hfile>) {
+               chomp $line;
+               $line =~ s/\s+$//;
+               $line =~ s/^\s+//;
+               push @headers, $line;
+       }
+       close $hfile;
+       $expected_column_count = scalar(@headers);
+       print "Expected column count set to $expected_column_count based on headers.\n";
+}
+
+print "_.,-~= reading $ARGV[0]\n";
+open my $in, "<:encoding(utf8)", $ARGV[0] or die "$ARGV[0]: $!";
+$line_no = 1; 
+while (my $line = <$in>) {
+       print ">>> main loop (#$line_no): $line" if $debug;
+       if ($backslash) {
+               $line = convert_backslashes($line);
+       }
+       if ($apply) {
+               $line = apply_line_fixes($line);
+       }
+       if ($csv->parse($line)) {
+               my @columns = $csv->fields();
+               if (! $expected_column_count) {
+                       $expected_column_count = scalar(@columns);
+                       print "Expected column count set to $expected_column_count based on first row.\n";
+                       for (my $i = 0; $i < scalar(@columns) ; $i++) {
+                               if ($create_headers) {
+                                       push @headers, "col" . ($i+1);
+                               } else {
+                                       push @headers, $columns[$i];
+                               }
+                       }
+               }
+               if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+                       if ($fix) {
+                               my $fix_status;
+                               ($fix_status,$line) = manual_fix($line,\@columns,0); # Regex only
+                               if ($fix_status ne 'N') {
+                                       delete $line_numbers_for_lines_with_errors{$line_no};
+                                       $fix_count++;
+                                       redo;
+                               }
+                       }
+               } else {
+                       if (scalar(@columns) < $expected_column_count) {
+                               if ($apply) {
+                                       my $new_line = apply_insert_fixes($line,\@columns);
+                                       if ($line ne $new_line) {
+                                               $line = $new_line;
+                                               redo;
+                                       }
+                               }
+                               if ($pad) {
+                                       $pad_count++;
+                                       print "\nline#$line_no>> padding line, from " . scalar(@columns) . " columns ";
+                                       my $col_count = scalar(@columns);
+                                       for (my $i = 0; $i < $expected_column_count - $col_count; $i++) {
+                                               push @columns, '#pad#';
+                                       }
+                                       print "to " . scalar(@columns) . " columns.\n";
+                                       eval {
+                                               print "before: " . format_for_display($line);
+                                               $line = combine_cols(\@columns);
+                                               print " after: " . format_for_display($line);
+                                       };
+                                       if ($@) {
+                                               print "padding error:\n";
+                                               die $@;
+                                       }
+                                       redo;
+                               }
+                       }
+                       if (scalar(@columns) > $expected_column_count) {
+                               if ($apply) {
+                                       my $new_line = apply_delete_fixes($line,\@columns);
+                                       if ($line ne $new_line) {
+                                               $line = $new_line;
+                                               redo;
+                                       }
+                                       $new_line = apply_join_fixes($line,\@columns);
+                                       if ($line ne $new_line) {
+                                               $line = $new_line;
+                                               redo;
+                                       }
+                               }
+                               if ($truncate) {
+                                       $trunc_count++;
+                                       print "\nline#$line_no>> truncating line, from " . scalar(@columns) . " columns ";
+                                       splice @columns, $expected_column_count;
+                                       print "to " . scalar(@columns) . " columns.\n";
+                                       eval {
+                                               print "before: " . format_for_display($line);
+                                               $line = combine_cols(\@columns);
+                                               print " after: " . format_for_display($line);
+                                       };
+                                       if ($@) {
+                                               print "truncating error:\n";
+                                               die $@;
+                                       }
+                                       redo;
+                               }
+                       }
+                       if (scalar(@columns) != $expected_column_count) {
+                               # so broken, but parseable, and thus not handled by the error callback
+                               print "\nline#$line_no * Expected $expected_column_count columns but found " . scalar(@columns) . "\n$line";
+                               print "-" x length($line) . "\n";
+                               $line_numbers_for_lines_with_errors{$line_no} = 1;
+                               if ($fix) {
+                                       my $fix_status;
+                                       ($fix_status,$line) = manual_fix($line,\@columns,1); # Insert/Delete allowed
+                                       if ($fix_status ne 'N') {
+                                               delete $line_numbers_for_lines_with_errors{$line_no};
+                                               $fix_count++;
+                                               redo;
+                                       }
+                               }
+                       }
+               }
+               if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+                       print "\tIncrementing errors with line# $line_no\n" if $debug;
+                       push @lines_with_errors, $line;
+               } else {
+                       print "\tIncrementing clean with line# $line_no\n" if $debug;
+                       push @parsed_rows, \@columns;
+               }
+               $line_no++;
+       } else {
+               die "Parsing error:\n" . $csv->error_input . "\n";
+       }
+}
+close $in;
+print "_.,-~= read " . ($line_no-1) . " records ";
+print "(" . scalar(@lines_with_errors) . " with errors, $pad_count auto-padded, $trunc_count auto-truncated, $backslash_count backslashes converted, $fix_count manual fixes)\n";
+
+
+################################################################## Writing good CSV
+
+print "_.,-~= writing $ARGV[0].clean\n";
+open my $out, ">:encoding(utf8)", "$ARGV[0].clean" or die "$ARGV[0].clean: $!";
+$line_no = 1;
+$actual_count = 0;
+if ($create_headers || $headers_file) {
+       unshift @parsed_rows, \@headers;
+}
+foreach my $row (@parsed_rows) {
+       eval {
+               $line = combine_cols($row);
+               print $out $line;
+       };
+       if ($@) {
+               print "error:\n";
+               die $@;
+       }
+       $actual_count++;
+}
+close $out;
+print "_.,-~= wrote " . ($actual_count) . " records\n";
+
+
+################################################################## Writing broken CSV
+
+print "_.,-~= writing $ARGV[0].error\n";
+open my $out2, ">:encoding(utf8)", "$ARGV[0].error" or die "$ARGV[0].error: $!";
+foreach my $row (@lines_with_errors) {
+       print $out2 $row;
+}
+close $out2;
+print "_.,-~= wrote " . (scalar @lines_with_errors) . " records\n";
+
+
+################################################################## .no_headers version
+
+print "_.,-~= creating $ARGV[0].clean.no_headers\n";
+
+print `tail -n +2 $ARGV[0].clean > $ARGV[0].clean.no_headers`;
+
+################################################################## Finished
+
+print "_.,-~= finished\n";