X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=text%2Fclean_csv;fp=text%2Fclean_csv;h=599a5d06da28c39855e559a21875df40bbac60a4;hp=0000000000000000000000000000000000000000;hb=f9201dc2d1699f5161e5e29690a1634e8063bb85;hpb=d1812fa8c4c9e220978d650adb3611c978a2a56b

diff --git a/text/clean_csv b/text/clean_csv
new file mode 100755
index 0000000..599a5d0
--- /dev/null
+++ b/text/clean_csv
@@ -0,0 +1,792 @@
+#!/usr/bin/perl -w
+use Storable;
+use Switch;
+use Getopt::Long;
+use Text::CSV_XS;
+use Text::CSV::Separator qw(get_separator);
+use Data::Dumper;
+use Term::ANSIColor;
+
+# may be manipulated with --config
+our %CSV_options = (
+    binary => 1,
+    auto_diag => 1,
+    diag_verbose => 1,
+);
+my $csv;
+our $fixes = {
+	'R' => [],
+	'I' => [],
+	'D' => []
+};
+my @parsed_rows;
+my @lines_with_errors = ();
+my %line_numbers_for_lines_with_errors = ();
+my $expected_column_count;
+my $line_no;
+
+# GetOpt variables
+my $config;
+my $id_cols;
+my $fix;
+my $nosave;
+my $apply;
+my $pad;
+my $truncate;
+my $backslash;
+my $debug;
+my $help;
+my $create_headers;
+my $headers_file;
+my @headers = ();
+
+my $pad_count = 0; my $trunc_count = 0; my $fix_count = 0; my $backslash_count = 0;
+
+################################################################## Subs
+
+sub format_for_display {
+	my $formatted_line = shift;
+	my $sep_char = $CSV_options{sep_char} || '\t';
+	my $sep = color 'bold blue';
+	$sep .= '<' . (ord($sep_char) < 32 ? ord($sep_char) : $sep_char) . '>';
+	$sep .= color 'reset';
+	my $quote_char = $CSV_options{quote_char} || '';
+	my $quote = color 'bold red';
+	$quote .= '<' . (ord($quote_char) < 32 ? ord($quote_char) : $quote_char) . '>';
+	$quote .= color 'reset';
+	my $escape_char = $CSV_options{escape_char} || '';
+	my $escape = color 'bold green';
+	$escape .= '<' . (ord($escape_char) < 32 ? ord($escape_char) : $escape_char) . '>';
+	$escape .= color 'reset';
+	my $real_escape_char = chr(27);
+	my $real_escape = color 'bold green';
+	$real_escape .= '<27>';
+	$real_escape .= color 'reset';
+
+	$formatted_line =~ s/$real_escape_char/$real_escape/g;
+	$formatted_line =~ s/$sep_char/$sep/g;
+	$formatted_line =~ s/$quote_char/$quote/g;
+	$formatted_line =~ s/$escape_char/$escape/g;
+	for (my $i = 0; $i < 32; $i++) {
+		if ($i == 27) { next; }
+		my $other_char = chr($i);
+		my $other = color 'yellow';
+		$other .= "<$i>";
+		$other .= color 'reset';
+		$formatted_line =~ s/$other_char/$other/g;
+	}
+	return "$formatted_line\n";
+}
+
+sub combine_cols {
+	my $row = shift;
+    my $status = $csv->combine(@{ $row });
+    if ($status && $csv->string) {
+        return $csv->string . "\n";
+    } else {
+        die $csv->error_input . "\n";
+    }
+}
+
+sub convert_backslashes {
+	my $line = shift;
+	my $altered_line;
+	my @count = $line =~ /\\/g;
+	if (scalar(@count) > 0) {
+		my $csv2 = Text::CSV_XS->new(\%CSV_options);
+		if ($csv2->parse($line)) {
+			my @columns = $csv2->fields();
+			foreach my $c (@columns) {
+				if ($c ne '\N') {
+					$c =~ s/\\/\//g;
+				}
+			}
+			$altered_line = combine_cols(\@columns);
+		} else {
+			$altered_line =~ s/\\/\//g;
+		}
+		if ($line ne $altered_line) {
+			$backslash_count += scalar(@count);
+			print "\nline#$line_no>> Converting " . scalar(@count) . " backslashes to forward slashes\n";
+			print "before: " . format_for_display($line);
+			$line = $altered_line;
+			print " after: " . format_for_display($line);
+		}
+	}
+	return $line;
+}
+
+sub apply_line_fixes {
+	my $line = shift;
+	foreach my $fix ( @{$fixes->{'R'}} ) {
+		my $id_regex = $fix->[0];
+		if ($line =~ /$id_regex/) {
+			print "\nline#$line_no>> Applying regex fix for $id_regex\n";
+			$fix_count++;
+			my $regex1 = $fix->[1];
+			my $regex2 = $fix->[2];
+			my $global = $fix->[3];
+			my $ignore_case = $fix->[4];
+			print "before: " . format_for_display($line);
+			$line = fix_via_regex($id_regex,$line,$regex1,$regex2,$global,$ignore_case);
+			print " after: " . format_for_display($line);
+		}
+	}
+	return $line;
+}
+
+sub apply_insert_fixes {
+	my $line = shift;
+	my $cols = shift;
+	foreach my $fix ( @{$fixes->{'I'}} ) {
+		my $id_regex = $fix->[0];
+		my $col_count_check = $fix->[1];
+		if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+			print "\nline#$line_no>> Applying insert for $id_regex\n";
+			$fix_count++;
+			my $fix_cols = $fix->[2];
+			my $fix_value = $fix->[3];
+			print "before: " . format_for_display($line);
+			$line = fix_via_insert($id_regex,$cols,$col_count_check,$fix_cols,$fix_value);
+			print " after: " . format_for_display($line);
+		}
+	}
+	return $line;
+}
+
+sub apply_delete_fixes {
+	my $line = shift;
+	my $cols = shift;
+	foreach my $fix ( @{$fixes->{'D'}} ) {
+		my $id_regex = $fix->[0];
+		my $col_count_check = $fix->[1];
+		if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+			print "\nline#$line_no>> Applying delete for $id_regex\n";
+			$fix_count++;
+			my $fix_cols = $fix->[2];
+			print "before: " . format_for_display($line);
+			$line = fix_via_delete($id_regex,$cols,$col_count_check,$fix_cols);
+			print " after: " . format_for_display($line);
+		}
+	}
+	return $line;
+}
+
+sub apply_join_fixes {
+	my $line = shift;
+	my $cols = shift;
+	foreach my $fix ( @{$fixes->{'J'}} ) {
+		my $id_regex = $fix->[0];
+		my $col_count_check = $fix->[1];
+		if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+			print "\nline#$line_no>> Applying join for $id_regex\n";
+			$fix_count++;
+			my $fix_cols = $fix->[2];
+			print "before: " . format_for_display($line);
+			$line = fix_via_join($id_regex,$cols,$col_count_check,$fix_cols);
+			print " after: " . format_for_display($line);
+		}
+	}
+	return $line;
+}
+
+sub save_fix {
+	print "saving fix...";
+	my $type = shift;
+	my $fix = shift;
+	if ($nosave) { print "psyche!\n"; return; }
+	print "fix = " . Dumper($fix) . "\n" if $debug;
+	push @{$fixes->{$type}}, $fix;
+	store $fixes, $ARGV[0] . '.fixes';
+	print "saved\n";
+}
+
+sub id_cols_regex {
+	my $cols = shift;
+	my @f = sort(split /,/, $id_cols || '0');
+	my $regex = '';
+	for (my $i = 0; $i < scalar(@f); $i++) {
+		if ($i > 0) {
+			$regex .= '.+'; # characters between id columns
+		}
+		$regex .= '.?' . $cols->[$f[$i]] . '.?';
+	}
+	$regex .= '';
+	return $regex;
+}
+
+sub fix_via_regex {
+	my $id_regex = shift;
+	my $line = shift;
+	my $fix_regex1 = shift;
+	my $fix_regex2 = shift;
+	my $global = shift;
+	my $ignore_case = shift;
+	my $save;
+
+	if (!$fix_regex1) {
+global_prompt:
+		print "Global (aka s/match/replace/g)? <Yes/No> [n] ";
+		$global = readline(STDIN); chomp $global;
+		$global = uc(substr($global,0,1));
+		if ($global eq '') {
+			$global = 'N';
+		}
+		$global = uc(substr($global,0,1));
+		if ($global ne 'Y' && $global ne 'N') {
+			goto global_prompt;
+		}
+case_prompt:
+		print "Ignore-case (aka s/match/replace/i)? <Yes/No> [n] ";
+		$ignore_case = readline(STDIN); chomp $ignore_case;
+		$ignore_case = uc(substr($ignore_case,0,1));
+		if ($ignore_case eq '') {
+			$ignore_case = 'N';
+		}
+		$ignore_case = uc(substr($ignore_case,0,1));
+		if ($ignore_case ne 'Y' && $ignore_case ne 'N') {
+			goto case_prompt;
+		}
+
+regex1_prompt:
+		print "Enter match regex for s/match/replace/: ";
+		$fix_regex1 = readline(STDIN); chomp $fix_regex1;
+		if ($fix_regex1 eq '') {
+			goto global_prompt;
+		}
+		if (
+			($global eq 'Y' && $ignore_case eq 'Y' && $line =~ /$fix_regex1/gi) 
+			|| ($global eq 'Y' && $ignore_case eq 'N' && $line =~ /$fix_regex1/g) 
+			|| ($global eq 'N' && $ignore_case eq 'N' && $line =~ /$fix_regex1/i) 
+		) {
+			print "Regex matches line.\n";
+		} else {
+			print "Regex does not match line.\n";
+			goto regex1_prompt;
+		}
+regex2_prompt:
+		print "Enter replace regex for s/match/replace/: ";
+		$fix_regex2 = readline(STDIN); chomp $fix_regex2;
+		if (substr($fix_regex1,-1) eq '$') {
+			print "Adding new line to end of /$fix_regex2/ based on \$ in /$fix_regex1/\n";
+			$fix_regex2 .= "\n";
+		}
+		# TODO - how to do we handle backreferences with this?
+		$save = 1;
+	}
+
+	switch ($global . $ignore_case) {
+		case 'YY'	{ $line =~ s/$fix_regex1/$fix_regex2/gi; }
+		case 'YN'	{ $line =~ s/$fix_regex1/$fix_regex2/g; }
+		case 'NY'	{ $line =~ s/$fix_regex1/$fix_regex2/i; }
+		case 'NN'	{ $line =~ s/$fix_regex1/$fix_regex2/; }
+	}
+
+	if ($save) {
+		save_fix('R',[
+			$id_regex,
+			$fix_regex1,
+			$fix_regex2,
+			$global,
+			$ignore_case
+		]);
+	}
+
+	return $line;
+}
+
+sub fix_via_insert {
+	my $id_regex = shift;
+	my $cols = shift;
+	my $col_count_check = shift;
+	my $fix_cols = shift;
+	my $fix_value = shift;
+	my $line;
+	my $save;
+
+	if (!$fix_cols) {
+		$col_count_check = scalar(@{$cols});
+		print "This fix will only trigger when the number of columns is $col_count_check.\n";
+		print "Enter value to insert: [] ";
+		$fix_value = readline(STDIN); chomp $fix_value;
+		print "Enter comma-separated list of column positions (0-based) for insertion: ";
+		$fix_cols = readline(STDIN); chomp $fix_cols;
+		$save = 1;
+	}
+
+	if ($col_count_check != scalar(@{$cols})) {
+		print "WARNING: Insert column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ".  Skipping.\n";
+		return $line;
+	}
+	
+	my @f = sort(split /,/, $fix_cols);
+	for (my $i = 0; $i < scalar(@f); $i++) {
+		splice @{ $cols }, $f[$i] + $i, 0, $fix_value;
+	}
+
+	eval {
+		$line = combine_cols($cols);
+	};
+	if ($@) {
+		print "fix_via_insert error:\n";
+		die $@;
+	}
+
+	if ($save) {
+		save_fix('I',[
+			$id_regex,
+			$col_count_check,
+			$fix_cols,
+			$fix_value
+		]);
+	}
+
+	return $line;
+}
+
+sub fix_via_delete {
+	my $id_regex = shift;
+	my $cols = shift;
+	my $col_count_check = shift;
+	my $fix_cols = shift;
+	my $line;
+	my $save;
+
+	if (!$fix_cols) {
+		$col_count_check = scalar(@{$cols});
+		print "This fix will only trigger when the number of columns is $col_count_check.\n";
+		print "Enter comma-separated list of column positions (0-based) to delete: ";
+		$fix_cols = readline(STDIN); chomp $fix_cols;
+		$save = 1;
+	}
+
+	if ($col_count_check != scalar(@{$cols})) {
+		print "WARNING: Delete column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ".  Skipping.\n";
+		return $line;
+	}
+	
+	my @f = sort(split /,/, $fix_cols);
+	for (my $i = 0; $i < scalar(@f); $i++) {
+		splice @{ $cols }, $f[$i] - $i, 1;
+	}
+
+	eval {
+		$line = combine_cols($cols);
+	};
+	if ($@) {
+		print "fix_via_delete error:\n";
+		die $@;
+	}
+
+	if ($save) {
+		save_fix('D',[
+			$id_regex,
+			$col_count_check,
+			$fix_cols
+		]);
+	}
+
+	return $line;
+}
+
+sub fix_via_join {
+	my $id_regex = shift;
+	my $cols = shift;
+	my $col_count_check = shift;
+	my $fix_cols = shift;
+	my $line;
+	my $save;
+
+	if (!$fix_cols) {
+		$col_count_check = scalar(@{$cols});
+		print "This fix will only trigger when the number of columns is $col_count_check.\n";
+		print "Enter comma-separated list of column positions (0-based) to join: ";
+		$fix_cols = readline(STDIN); chomp $fix_cols;
+		$save = 1;
+	}
+
+	if ($col_count_check != scalar(@{$cols})) {
+		print "WARNING: Join column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ".  Skipping.\n";
+		return $line;
+	}
+
+	# gather
+	my $fix_value = '';
+	my @f = sort { $a <=> $b } (split /,/, $fix_cols);
+	for (my $i = 0; $i < scalar(@f); $i++) {
+		$fix_value .= @{ $cols }[$f[$i]];
+	}
+
+	# delete	
+	for (my $i = 0; $i < scalar(@f); $i++) {
+		splice @{ $cols }, $f[$i] - $i, 1;
+	}
+
+	# insert
+	splice @{ $cols }, $f[0], 0, $fix_value;
+
+	eval {
+		$line = combine_cols($cols);
+	};
+	if ($@) {
+		print "fix_via_join error:\n";
+		die $@;
+	}
+
+	if ($save) {
+		save_fix('J',[
+			$id_regex,
+			$col_count_check,
+			$fix_cols
+		]);
+	}
+
+	return $line;
+}
+
+
+sub manual_fix {
+	my $line = shift;
+	my $cols = shift;
+	my $insert_delete_allowed = shift;
+	my $col_count = scalar(@{$cols}) > scalar(@headers) ? scalar(@{$cols}) : scalar(@headers);
+	my $max_header_length = 0;
+
+	# display columns nice and formatted
+	for (my $i = 0; $i < scalar(@headers); $i++) {
+		if (length($headers[$i]) > $max_header_length) {
+			$max_header_length = length($headers[$i]);
+		}
+	}
+	for (my $i = 0; $i < $col_count; $i++) {
+		printf "#% d ", $i;
+		if (defined $headers[$i]) {
+			print " " x ($max_header_length - length($headers[$i]));
+			print $headers[$i];
+		} else {
+			print " " x $max_header_length;
+		}
+		print ": ";
+		if (defined $cols->[$i]) {
+			print "<" . $cols->[$i] . ">";
+		}
+		print "\n";
+	}
+
+	# prompt for type of fix
+fix_prompt:
+	print "\n" . format_for_display($line) . "\nFix line#$line_no? <Regex" . ($insert_delete_allowed ? '|Insert|Delete|Join' : '') . "|No> [n] ";
+	my $ans = readline(STDIN); chomp $ans;
+	$ans = uc(substr($ans,0,1));
+	if ($ans eq '') {
+		$ans = 'N';
+	}
+	if ($insert_delete_allowed ? index("RIDJN",$ans)==-1 : index("RN",$ans)==-1) {
+		goto fix_prompt;
+	}
+
+	# prompt for matching condition
+	my $id_regex;
+	if ($ans ne 'N') {
+		my $default_id_regex = id_cols_regex($cols);
+match_prompt:
+		print "If matching the end of the string, you may need to use \\s*\$ instead of \$\n";
+		print "Insert/Delete/Join fixes will also filter on column count.\n";
+		print "Identify this line (and optionally similar lines) with regex: [$default_id_regex] ";
+		$id_regex = readline(STDIN); chomp $id_regex;
+		if ($id_regex eq '') {
+			$id_regex = $default_id_regex;
+		}
+		if ($line =~ /$id_regex/) {
+			print "Regex matches line.\n";
+		} else {
+			print "Regex does not match line.\n";
+			goto match_prompt;
+		}
+	}
+
+	# prompt and perform actual fixes
+	switch($ans) {
+		case 'R'	{ $line = fix_via_regex($id_regex,$line); }
+		case 'I'	{ $line = fix_via_insert($id_regex,$cols); }
+		case 'D'	{ $line = fix_via_delete($id_regex,$cols); }
+		case 'J'	{ $line = fix_via_join($id_regex,$cols); }
+		case 'N'	{ }
+		else		{ $ans = 'N'; }
+	}
+
+	if ($ans ne 'N') {
+		print "\nNew line#$line_no: $line";
+	}
+	return ( $ans, $line );
+}
+
+################################################################## Init
+GetOptions(
+	'config=s' => \$config,
+	'idcols=s' => \$id_cols,
+	'create-headers' => \$create_headers,
+	'use-headers=s' => \$headers_file,
+	'fix' => \$fix,
+	'nosave' => \$nosave,
+	'apply' => \$apply,
+	'pad' => \$pad,
+	'truncate' => \$truncate,
+	'backslash' => \$backslash,
+	'debug' => \$debug,
+	'help|?' => \$help
+);
+if ($help || ((@ARGV == 0) && (-t STDIN))) {
+	die "\n\t$0 [--config <CONFIG>] [--idcols <idx1,idx2,...>] [--fix] [--apply] [--pad] [--truncate] <FILE>\n\n"
+		. "\tExpects <FILE> to be a CSV-like UTF-8 encoded file.\n"
+		. "\tWill produce <FILE>.clean and <FILE>.error versions of said file.\n\n"
+		. "\t--config <CONFIG> will read the Perl file <CONFIG> for settings information.  See 'Example Config' below\n\n"
+		. "\t--create-headers will generate headers like so: col1, col2, col3, etc.\n"
+		. "\t--use-headers <HFILE> will generate headers based on the specified <HFILE>, which must contain one column header per line.\n"
+		. "\t(if neither --create-headers nor --use-headers are specified, then the first line in <FILE> is assumed to contain the column headers)\n\n"
+		. "\t--fix will prompt for whether and how to fix broken records, and save those fixes in <FILE>.fixes\n"
+		. "\t--idcols <idx1,idx2,...> takes a comma-separated list of column indexes (starting with 0) to use as matchpoint suggestions for fixes\n"
+		. "\t--nosave will prevent new fixes from being saved in <FILE>.fixes\n"
+		. "\t--apply will apply previously recorded fixes from <FILE>.fixes\n\n"
+		. "\t--pad will fill in missing columns at the end if needed for otherwise unbroken records\n"
+		. "\t--truncate will strip extra columns from the end if needed for otherwise unbroken records\n"
+		. "\t--backslash will convert backslashes into forward slashes\n\n"
+		. "\t Example Config:\n\n"
+		. "\t\t\$CSV_options{quote_char} = '\"';\n"
+		. "\t\t\$CSV_options{escape_char} = '\"';\n"
+		. "\t\t\$CSV_options{sep_char} = ',';\n"
+		. "\t\t\$CSV_options{eol} = \$\\;\n"
+		. "\t\t\$CSV_options{always_quote} = 0;\n"
+		. "\t\t\$CSV_options{quote_space} = 1;\n"
+		. "\t\t\$CSV_options{quote_null} = 1;\n"
+		. "\t\t\$CSV_options{quote_binary} = 1;\n"
+		. "\t\t\$CSV_options{binary} = 0;\n"
+		. "\t\t\$CSV_options{decode_utf8} = 1;\n"
+		. "\t\t\$CSV_options{keep_meta_info} = 0;\n"
+		. "\t\t\$CSV_options{allow_loose_quotes} = 0;\n"
+		. "\t\t\$CSV_options{allow_loose_escapes} = 0;\n"
+		. "\t\t\$CSV_options{allow_unquoted_escape} = 0;\n"
+		. "\t\t\$CSV_options{allow_whitespace} = 0;\n"
+		. "\t\t\$CSV_options{blank_is_undef} = 0;\n"
+		. "\t\t\$CSV_options{empty_is_undef} = 0;\n"
+		. "\t\t\$CSV_options{verbatim} = 0;\n"
+		. "\n\n";
+}
+if (! -e $ARGV[0]) {
+	die "$ARGV[0] does not exist\n";
+}
+if ($config && ! -e $config) {
+	die "$config does not exist\n";
+}
+if ($apply && -e $ARGV[0] . '.fixes') {
+	$fixes = retrieve($ARGV[0] . '.fixes');
+}
+
+################################################################## CSV Setup
+$CSV_options{sep_char} = get_separator( path => $ARGV[0], lucky => 1 );
+if ($config && -e $config) {
+	do $config;
+}
+$csv = Text::CSV_XS->new(\%CSV_options);
+$csv->callbacks(
+    error => sub {
+        my ($err, $msg, $pos, $recno) = @_;
+        return if ($err == 2012);
+		$line_numbers_for_lines_with_errors{$line_no} = 1;
+        print "\nline#$line_no * $err : $msg -> (pos#$pos,rec#$recno)\n";
+        if ($csv->error_input) {
+            print $csv->error_input;
+            print "-" x ($pos - 1);
+            print "^\n";
+        }
+        $csv->SetDiag(0);
+        return;
+    }
+);
+
+################################################################## Reading
+
+if ($headers_file) {
+	print "_.,-~= reading $headers_file\n";
+	open my $hfile, "<:encoding(utf8)", $headers_file or die "$headers_file: $!";
+	while (my $line = <$hfile>) {
+		chomp $line;
+		$line =~ s/\s+$//;
+		$line =~ s/^\s+//;
+		push @headers, $line;
+	}
+	close $hfile;
+	$expected_column_count = scalar(@headers);
+	print "Expected column count set to $expected_column_count based on headers.\n";
+}
+
+print "_.,-~= reading $ARGV[0]\n";
+open my $in, "<:encoding(utf8)", $ARGV[0] or die "$ARGV[0]: $!";
+$line_no = 1; 
+while (my $line = <$in>) {
+	print ">>> main loop (#$line_no): $line" if $debug;
+	if ($backslash) {
+		$line = convert_backslashes($line);
+	}
+	if ($apply) {
+		$line = apply_line_fixes($line);
+	}
+	if ($csv->parse($line)) {
+		my @columns = $csv->fields();
+		if (! $expected_column_count) {
+			$expected_column_count = scalar(@columns);
+			print "Expected column count set to $expected_column_count based on first row.\n";
+			for (my $i = 0; $i < scalar(@columns) ; $i++) {
+				if ($create_headers) {
+					push @headers, "col" . ($i+1);
+				} else {
+					push @headers, $columns[$i];
+				}
+			}
+		}
+		if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+			if ($fix) {
+				my $fix_status;
+				($fix_status,$line) = manual_fix($line,\@columns,0); # Regex only
+				if ($fix_status ne 'N') {
+					delete $line_numbers_for_lines_with_errors{$line_no};
+					$fix_count++;
+					redo;
+				}
+			}
+		} else {
+			if (scalar(@columns) < $expected_column_count) {
+				if ($apply) {
+					my $new_line = apply_insert_fixes($line,\@columns);
+					if ($line ne $new_line) {
+						$line = $new_line;
+						redo;
+					}
+				}
+				if ($pad) {
+					$pad_count++;
+					print "\nline#$line_no>> padding line, from " . scalar(@columns) . " columns ";
+					my $col_count = scalar(@columns);
+					for (my $i = 0; $i < $expected_column_count - $col_count; $i++) {
+						push @columns, '#pad#';
+					}
+					print "to " . scalar(@columns) . " columns.\n";
+					eval {
+						print "before: " . format_for_display($line);
+						$line = combine_cols(\@columns);
+						print " after: " . format_for_display($line);
+					};
+					if ($@) {
+						print "padding error:\n";
+						die $@;
+					}
+					redo;
+				}
+			}
+			if (scalar(@columns) > $expected_column_count) {
+				if ($apply) {
+					my $new_line = apply_delete_fixes($line,\@columns);
+					if ($line ne $new_line) {
+						$line = $new_line;
+						redo;
+					}
+					$new_line = apply_join_fixes($line,\@columns);
+					if ($line ne $new_line) {
+						$line = $new_line;
+						redo;
+					}
+				}
+				if ($truncate) {
+					$trunc_count++;
+					print "\nline#$line_no>> truncating line, from " . scalar(@columns) . " columns ";
+					splice @columns, $expected_column_count;
+					print "to " . scalar(@columns) . " columns.\n";
+					eval {
+						print "before: " . format_for_display($line);
+						$line = combine_cols(\@columns);
+						print " after: " . format_for_display($line);
+					};
+					if ($@) {
+						print "truncating error:\n";
+						die $@;
+					}
+					redo;
+				}
+			}
+			if (scalar(@columns) != $expected_column_count) {
+				# so broken, but parseable, and thus not handled by the error callback
+				print "\nline#$line_no * Expected $expected_column_count columns but found " . scalar(@columns) . "\n$line";
+				print "-" x length($line) . "\n";
+				$line_numbers_for_lines_with_errors{$line_no} = 1;
+				if ($fix) {
+					my $fix_status;
+					($fix_status,$line) = manual_fix($line,\@columns,1); # Insert/Delete allowed
+					if ($fix_status ne 'N') {
+						delete $line_numbers_for_lines_with_errors{$line_no};
+						$fix_count++;
+						redo;
+					}
+				}
+			}
+		}
+		if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+			print "\tIncrementing errors with line# $line_no\n" if $debug;
+			push @lines_with_errors, $line;
+		} else {
+			print "\tIncrementing clean with line# $line_no\n" if $debug;
+			push @parsed_rows, \@columns;
+		}
+		$line_no++;
+	} else {
+		die "Parsing error:\n" . $csv->error_input . "\n";
+	}
+}
+close $in;
+print "_.,-~= read " . ($line_no-1) . " records ";
+print "(" . scalar(@lines_with_errors) . " with errors, $pad_count auto-padded, $trunc_count auto-truncated, $backslash_count backslashes converted, $fix_count manual fixes)\n";
+
+
+################################################################## Writing good CSV
+
+print "_.,-~= writing $ARGV[0].clean\n";
+open my $out, ">:encoding(utf8)", "$ARGV[0].clean" or die "$ARGV[0].clean: $!";
+$line_no = 1;
+$actual_count = 0;
+if ($create_headers || $headers_file) {
+	unshift @parsed_rows, \@headers;
+}
+foreach my $row (@parsed_rows) {
+	eval {
+		$line = combine_cols($row);
+		print $out $line;
+	};
+	if ($@) {
+		print "error:\n";
+		die $@;
+	}
+	$actual_count++;
+}
+close $out;
+print "_.,-~= wrote " . ($actual_count) . " records\n";
+
+
+################################################################## Writing broken CSV
+
+print "_.,-~= writing $ARGV[0].error\n";
+open my $out2, ">:encoding(utf8)", "$ARGV[0].error" or die "$ARGV[0].error: $!";
+foreach my $row (@lines_with_errors) {
+	print $out2 $row;
+}
+close $out2;
+print "_.,-~= wrote " . (scalar @lines_with_errors) . " records\n";
+
+
+################################################################## .no_headers version
+
+print "_.,-~= creating $ARGV[0].clean.no_headers\n";
+
+print `tail -n +2 $ARGV[0].clean > $ARGV[0].clean.no_headers`;
+
+################################################################## Finished
+
+print "_.,-~= finished\n";