--- /dev/null
+#!/usr/bin/perl -w
+use Storable;
+use Switch;
+use Getopt::Long;
+use Text::CSV_XS;
+use Text::CSV::Separator qw(get_separator);
+use Data::Dumper;
+use Term::ANSIColor;
+
+# may be manipulated with --config
+our %CSV_options = (
+ binary => 1,
+ auto_diag => 1,
+ diag_verbose => 1,
+);
+my $csv;
+our $fixes = {
+ 'R' => [],
+ 'I' => [],
+ 'D' => []
+};
+my @parsed_rows;
+my @lines_with_errors = ();
+my %line_numbers_for_lines_with_errors = ();
+my $expected_column_count;
+my $line_no;
+
+# GetOpt variables
+my $config;
+my $id_cols;
+my $fix;
+my $nosave;
+my $apply;
+my $pad;
+my $truncate;
+my $backslash;
+my $debug;
+my $help;
+my $create_headers;
+my $headers_file;
+my @headers = ();
+
+my $pad_count = 0; my $trunc_count = 0; my $fix_count = 0; my $backslash_count = 0;
+
+################################################################## Subs
+
+sub format_for_display {
+ my $formatted_line = shift;
+ my $sep_char = $CSV_options{sep_char} || '\t';
+ my $sep = color 'bold blue';
+ $sep .= '<' . (ord($sep_char) < 32 ? ord($sep_char) : $sep_char) . '>';
+ $sep .= color 'reset';
+ my $quote_char = $CSV_options{quote_char} || '';
+ my $quote = color 'bold red';
+ $quote .= '<' . (ord($quote_char) < 32 ? ord($quote_char) : $quote_char) . '>';
+ $quote .= color 'reset';
+ my $escape_char = $CSV_options{escape_char} || '';
+ my $escape = color 'bold green';
+ $escape .= '<' . (ord($escape_char) < 32 ? ord($escape_char) : $escape_char) . '>';
+ $escape .= color 'reset';
+ my $real_escape_char = chr(27);
+ my $real_escape = color 'bold green';
+ $real_escape .= '<27>';
+ $real_escape .= color 'reset';
+
+ $formatted_line =~ s/$real_escape_char/$real_escape/g;
+ $formatted_line =~ s/$sep_char/$sep/g;
+ $formatted_line =~ s/$quote_char/$quote/g;
+ $formatted_line =~ s/$escape_char/$escape/g;
+ for (my $i = 0; $i < 32; $i++) {
+ if ($i == 27) { next; }
+ my $other_char = chr($i);
+ my $other = color 'yellow';
+ $other .= "<$i>";
+ $other .= color 'reset';
+ $formatted_line =~ s/$other_char/$other/g;
+ }
+ return "$formatted_line\n";
+}
+
+sub combine_cols {
+ my $row = shift;
+ my $status = $csv->combine(@{ $row });
+ if ($status && $csv->string) {
+ return $csv->string . "\n";
+ } else {
+ die $csv->error_input . "\n";
+ }
+}
+
+sub convert_backslashes {
+ my $line = shift;
+ my $altered_line;
+ my @count = $line =~ /\\/g;
+ if (scalar(@count) > 0) {
+ my $csv2 = Text::CSV_XS->new(\%CSV_options);
+ if ($csv2->parse($line)) {
+ my @columns = $csv2->fields();
+ foreach my $c (@columns) {
+ if ($c ne '\N') {
+ $c =~ s/\\/\//g;
+ }
+ }
+ $altered_line = combine_cols(\@columns);
+ } else {
+ $altered_line =~ s/\\/\//g;
+ }
+ if ($line ne $altered_line) {
+ $backslash_count += scalar(@count);
+ print "\nline#$line_no>> Converting " . scalar(@count) . " backslashes to forward slashes\n";
+ print "before: " . format_for_display($line);
+ $line = $altered_line;
+ print " after: " . format_for_display($line);
+ }
+ }
+ return $line;
+}
+
+sub apply_line_fixes {
+ my $line = shift;
+ foreach my $fix ( @{$fixes->{'R'}} ) {
+ my $id_regex = $fix->[0];
+ if ($line =~ /$id_regex/) {
+ print "\nline#$line_no>> Applying regex fix for $id_regex\n";
+ $fix_count++;
+ my $regex1 = $fix->[1];
+ my $regex2 = $fix->[2];
+ my $global = $fix->[3];
+ my $ignore_case = $fix->[4];
+ print "before: " . format_for_display($line);
+ $line = fix_via_regex($id_regex,$line,$regex1,$regex2,$global,$ignore_case);
+ print " after: " . format_for_display($line);
+ }
+ }
+ return $line;
+}
+
+sub apply_insert_fixes {
+ my $line = shift;
+ my $cols = shift;
+ foreach my $fix ( @{$fixes->{'I'}} ) {
+ my $id_regex = $fix->[0];
+ my $col_count_check = $fix->[1];
+ if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+ print "\nline#$line_no>> Applying insert for $id_regex\n";
+ $fix_count++;
+ my $fix_cols = $fix->[2];
+ my $fix_value = $fix->[3];
+ print "before: " . format_for_display($line);
+ $line = fix_via_insert($id_regex,$cols,$col_count_check,$fix_cols,$fix_value);
+ print " after: " . format_for_display($line);
+ }
+ }
+ return $line;
+}
+
+sub apply_delete_fixes {
+ my $line = shift;
+ my $cols = shift;
+ foreach my $fix ( @{$fixes->{'D'}} ) {
+ my $id_regex = $fix->[0];
+ my $col_count_check = $fix->[1];
+ if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+ print "\nline#$line_no>> Applying delete for $id_regex\n";
+ $fix_count++;
+ my $fix_cols = $fix->[2];
+ print "before: " . format_for_display($line);
+ $line = fix_via_delete($id_regex,$cols,$col_count_check,$fix_cols);
+ print " after: " . format_for_display($line);
+ }
+ }
+ return $line;
+}
+
+sub apply_join_fixes {
+ my $line = shift;
+ my $cols = shift;
+ foreach my $fix ( @{$fixes->{'J'}} ) {
+ my $id_regex = $fix->[0];
+ my $col_count_check = $fix->[1];
+ if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
+ print "\nline#$line_no>> Applying join for $id_regex\n";
+ $fix_count++;
+ my $fix_cols = $fix->[2];
+ print "before: " . format_for_display($line);
+ $line = fix_via_join($id_regex,$cols,$col_count_check,$fix_cols);
+ print " after: " . format_for_display($line);
+ }
+ }
+ return $line;
+}
+
+sub save_fix {
+ print "saving fix...";
+ my $type = shift;
+ my $fix = shift;
+ if ($nosave) { print "psyche!\n"; return; }
+ print "fix = " . Dumper($fix) . "\n" if $debug;
+ push @{$fixes->{$type}}, $fix;
+ store $fixes, $ARGV[0] . '.fixes';
+ print "saved\n";
+}
+
+sub id_cols_regex {
+ my $cols = shift;
+ my @f = sort(split /,/, $id_cols || '0');
+ my $regex = '';
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ if ($i > 0) {
+ $regex .= '.+'; # characters between id columns
+ }
+ $regex .= '.?' . $cols->[$f[$i]] . '.?';
+ }
+ $regex .= '';
+ return $regex;
+}
+
+sub fix_via_regex {
+ my $id_regex = shift;
+ my $line = shift;
+ my $fix_regex1 = shift;
+ my $fix_regex2 = shift;
+ my $global = shift;
+ my $ignore_case = shift;
+ my $save;
+
+ if (!$fix_regex1) {
+global_prompt:
+ print "Global (aka s/match/replace/g)? <Yes/No> [n] ";
+ $global = readline(STDIN); chomp $global;
+ $global = uc(substr($global,0,1));
+ if ($global eq '') {
+ $global = 'N';
+ }
+ $global = uc(substr($global,0,1));
+ if ($global ne 'Y' && $global ne 'N') {
+ goto global_prompt;
+ }
+case_prompt:
+ print "Ignore-case (aka s/match/replace/i)? <Yes/No> [n] ";
+ $ignore_case = readline(STDIN); chomp $ignore_case;
+ $ignore_case = uc(substr($ignore_case,0,1));
+ if ($ignore_case eq '') {
+ $ignore_case = 'N';
+ }
+ $ignore_case = uc(substr($ignore_case,0,1));
+ if ($ignore_case ne 'Y' && $ignore_case ne 'N') {
+ goto case_prompt;
+ }
+
+regex1_prompt:
+ print "Enter match regex for s/match/replace/: ";
+ $fix_regex1 = readline(STDIN); chomp $fix_regex1;
+ if ($fix_regex1 eq '') {
+ goto global_prompt;
+ }
+ if (
+ ($global eq 'Y' && $ignore_case eq 'Y' && $line =~ /$fix_regex1/gi)
+ || ($global eq 'Y' && $ignore_case eq 'N' && $line =~ /$fix_regex1/g)
+ || ($global eq 'N' && $ignore_case eq 'N' && $line =~ /$fix_regex1/i)
+ ) {
+ print "Regex matches line.\n";
+ } else {
+ print "Regex does not match line.\n";
+ goto regex1_prompt;
+ }
+regex2_prompt:
+ print "Enter replace regex for s/match/replace/: ";
+ $fix_regex2 = readline(STDIN); chomp $fix_regex2;
+ if (substr($fix_regex1,-1) eq '$') {
+ print "Adding new line to end of /$fix_regex2/ based on \$ in /$fix_regex1/\n";
+ $fix_regex2 .= "\n";
+ }
+ # TODO - how to do we handle backreferences with this?
+ $save = 1;
+ }
+
+ switch ($global . $ignore_case) {
+ case 'YY' { $line =~ s/$fix_regex1/$fix_regex2/gi; }
+ case 'YN' { $line =~ s/$fix_regex1/$fix_regex2/g; }
+ case 'NY' { $line =~ s/$fix_regex1/$fix_regex2/i; }
+ case 'NN' { $line =~ s/$fix_regex1/$fix_regex2/; }
+ }
+
+ if ($save) {
+ save_fix('R',[
+ $id_regex,
+ $fix_regex1,
+ $fix_regex2,
+ $global,
+ $ignore_case
+ ]);
+ }
+
+ return $line;
+}
+
+sub fix_via_insert {
+ my $id_regex = shift;
+ my $cols = shift;
+ my $col_count_check = shift;
+ my $fix_cols = shift;
+ my $fix_value = shift;
+ my $line;
+ my $save;
+
+ if (!$fix_cols) {
+ $col_count_check = scalar(@{$cols});
+ print "This fix will only trigger when the number of columns is $col_count_check.\n";
+ print "Enter value to insert: [] ";
+ $fix_value = readline(STDIN); chomp $fix_value;
+ print "Enter comma-separated list of column positions (0-based) for insertion: ";
+ $fix_cols = readline(STDIN); chomp $fix_cols;
+ $save = 1;
+ }
+
+ if ($col_count_check != scalar(@{$cols})) {
+ print "WARNING: Insert column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
+ return $line;
+ }
+
+ my @f = sort(split /,/, $fix_cols);
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ splice @{ $cols }, $f[$i] + $i, 0, $fix_value;
+ }
+
+ eval {
+ $line = combine_cols($cols);
+ };
+ if ($@) {
+ print "fix_via_insert error:\n";
+ die $@;
+ }
+
+ if ($save) {
+ save_fix('I',[
+ $id_regex,
+ $col_count_check,
+ $fix_cols,
+ $fix_value
+ ]);
+ }
+
+ return $line;
+}
+
+sub fix_via_delete {
+ my $id_regex = shift;
+ my $cols = shift;
+ my $col_count_check = shift;
+ my $fix_cols = shift;
+ my $line;
+ my $save;
+
+ if (!$fix_cols) {
+ $col_count_check = scalar(@{$cols});
+ print "This fix will only trigger when the number of columns is $col_count_check.\n";
+ print "Enter comma-separated list of column positions (0-based) to delete: ";
+ $fix_cols = readline(STDIN); chomp $fix_cols;
+ $save = 1;
+ }
+
+ if ($col_count_check != scalar(@{$cols})) {
+ print "WARNING: Delete column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
+ return $line;
+ }
+
+ my @f = sort(split /,/, $fix_cols);
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ splice @{ $cols }, $f[$i] - $i, 1;
+ }
+
+ eval {
+ $line = combine_cols($cols);
+ };
+ if ($@) {
+ print "fix_via_delete error:\n";
+ die $@;
+ }
+
+ if ($save) {
+ save_fix('D',[
+ $id_regex,
+ $col_count_check,
+ $fix_cols
+ ]);
+ }
+
+ return $line;
+}
+
+sub fix_via_join {
+ my $id_regex = shift;
+ my $cols = shift;
+ my $col_count_check = shift;
+ my $fix_cols = shift;
+ my $line;
+ my $save;
+
+ if (!$fix_cols) {
+ $col_count_check = scalar(@{$cols});
+ print "This fix will only trigger when the number of columns is $col_count_check.\n";
+ print "Enter comma-separated list of column positions (0-based) to join: ";
+ $fix_cols = readline(STDIN); chomp $fix_cols;
+ $save = 1;
+ }
+
+ if ($col_count_check != scalar(@{$cols})) {
+ print "WARNING: Join column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
+ return $line;
+ }
+
+ # gather
+ my $fix_value = '';
+ my @f = sort { $a <=> $b } (split /,/, $fix_cols);
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ $fix_value .= @{ $cols }[$f[$i]];
+ }
+
+ # delete
+ for (my $i = 0; $i < scalar(@f); $i++) {
+ splice @{ $cols }, $f[$i] - $i, 1;
+ }
+
+ # insert
+ splice @{ $cols }, $f[0], 0, $fix_value;
+
+ eval {
+ $line = combine_cols($cols);
+ };
+ if ($@) {
+ print "fix_via_join error:\n";
+ die $@;
+ }
+
+ if ($save) {
+ save_fix('J',[
+ $id_regex,
+ $col_count_check,
+ $fix_cols
+ ]);
+ }
+
+ return $line;
+}
+
+
+sub manual_fix {
+ my $line = shift;
+ my $cols = shift;
+ my $insert_delete_allowed = shift;
+ my $col_count = scalar(@{$cols}) > scalar(@headers) ? scalar(@{$cols}) : scalar(@headers);
+ my $max_header_length = 0;
+
+ # display columns nice and formatted
+ for (my $i = 0; $i < scalar(@headers); $i++) {
+ if (length($headers[$i]) > $max_header_length) {
+ $max_header_length = length($headers[$i]);
+ }
+ }
+ for (my $i = 0; $i < $col_count; $i++) {
+ printf "#% d ", $i;
+ if (defined $headers[$i]) {
+ print " " x ($max_header_length - length($headers[$i]));
+ print $headers[$i];
+ } else {
+ print " " x $max_header_length;
+ }
+ print ": ";
+ if (defined $cols->[$i]) {
+ print "<" . $cols->[$i] . ">";
+ }
+ print "\n";
+ }
+
+ # prompt for type of fix
+fix_prompt:
+ print "\n" . format_for_display($line) . "\nFix line#$line_no? <Regex" . ($insert_delete_allowed ? '|Insert|Delete|Join' : '') . "|No> [n] ";
+ my $ans = readline(STDIN); chomp $ans;
+ $ans = uc(substr($ans,0,1));
+ if ($ans eq '') {
+ $ans = 'N';
+ }
+ if ($insert_delete_allowed ? index("RIDJN",$ans)==-1 : index("RN",$ans)==-1) {
+ goto fix_prompt;
+ }
+
+ # prompt for matching condition
+ my $id_regex;
+ if ($ans ne 'N') {
+ my $default_id_regex = id_cols_regex($cols);
+match_prompt:
+ print "If matching the end of the string, you may need to use \\s*\$ instead of \$\n";
+ print "Insert/Delete/Join fixes will also filter on column count.\n";
+ print "Identify this line (and optionally similar lines) with regex: [$default_id_regex] ";
+ $id_regex = readline(STDIN); chomp $id_regex;
+ if ($id_regex eq '') {
+ $id_regex = $default_id_regex;
+ }
+ if ($line =~ /$id_regex/) {
+ print "Regex matches line.\n";
+ } else {
+ print "Regex does not match line.\n";
+ goto match_prompt;
+ }
+ }
+
+ # prompt and perform actual fixes
+ switch($ans) {
+ case 'R' { $line = fix_via_regex($id_regex,$line); }
+ case 'I' { $line = fix_via_insert($id_regex,$cols); }
+ case 'D' { $line = fix_via_delete($id_regex,$cols); }
+ case 'J' { $line = fix_via_join($id_regex,$cols); }
+ case 'N' { }
+ else { $ans = 'N'; }
+ }
+
+ if ($ans ne 'N') {
+ print "\nNew line#$line_no: $line";
+ }
+ return ( $ans, $line );
+}
+
+################################################################## Init
+GetOptions(
+ 'config=s' => \$config,
+ 'idcols=s' => \$id_cols,
+ 'create-headers' => \$create_headers,
+ 'use-headers=s' => \$headers_file,
+ 'fix' => \$fix,
+ 'nosave' => \$nosave,
+ 'apply' => \$apply,
+ 'pad' => \$pad,
+ 'truncate' => \$truncate,
+ 'backslash' => \$backslash,
+ 'debug' => \$debug,
+ 'help|?' => \$help
+);
+if ($help || ((@ARGV == 0) && (-t STDIN))) {
+ die "\n\t$0 [--config <CONFIG>] [--idcols <idx1,idx2,...>] [--fix] [--apply] [--pad] [--truncate] <FILE>\n\n"
+ . "\tExpects <FILE> to be a CSV-like UTF-8 encoded file.\n"
+ . "\tWill produce <FILE>.clean and <FILE>.error versions of said file.\n\n"
+ . "\t--config <CONFIG> will read the Perl file <CONFIG> for settings information. See 'Example Config' below\n\n"
+ . "\t--create-headers will generate headers like so: col1, col2, col3, etc.\n"
+ . "\t--use-headers <HFILE> will generate headers based on the specified <HFILE>, which must contain one column header per line.\n"
+ . "\t(if neither --create-headers nor --use-headers are specified, then the first line in <FILE> is assumed to contain the column headers)\n\n"
+ . "\t--fix will prompt for whether and how to fix broken records, and save those fixes in <FILE>.fixes\n"
+ . "\t--idcols <idx1,idx2,...> takes a comma-separated list of column indexes (starting with 0) to use as matchpoint suggestions for fixes\n"
+ . "\t--nosave will prevent new fixes from being saved in <FILE>.fixes\n"
+ . "\t--apply will apply previously recorded fixes from <FILE>.fixes\n\n"
+ . "\t--pad will fill in missing columns at the end if needed for otherwise unbroken records\n"
+ . "\t--truncate will strip extra columns from the end if needed for otherwise unbroken records\n"
+ . "\t--backslash will convert backslashes into forward slashes\n\n"
+ . "\t Example Config:\n\n"
+ . "\t\t\$CSV_options{quote_char} = '\"';\n"
+ . "\t\t\$CSV_options{escape_char} = '\"';\n"
+ . "\t\t\$CSV_options{sep_char} = ',';\n"
+ . "\t\t\$CSV_options{eol} = \$\\;\n"
+ . "\t\t\$CSV_options{always_quote} = 0;\n"
+ . "\t\t\$CSV_options{quote_space} = 1;\n"
+ . "\t\t\$CSV_options{quote_null} = 1;\n"
+ . "\t\t\$CSV_options{quote_binary} = 1;\n"
+ . "\t\t\$CSV_options{binary} = 0;\n"
+ . "\t\t\$CSV_options{decode_utf8} = 1;\n"
+ . "\t\t\$CSV_options{keep_meta_info} = 0;\n"
+ . "\t\t\$CSV_options{allow_loose_quotes} = 0;\n"
+ . "\t\t\$CSV_options{allow_loose_escapes} = 0;\n"
+ . "\t\t\$CSV_options{allow_unquoted_escape} = 0;\n"
+ . "\t\t\$CSV_options{allow_whitespace} = 0;\n"
+ . "\t\t\$CSV_options{blank_is_undef} = 0;\n"
+ . "\t\t\$CSV_options{empty_is_undef} = 0;\n"
+ . "\t\t\$CSV_options{verbatim} = 0;\n"
+ . "\n\n";
+}
+if (! -e $ARGV[0]) {
+ die "$ARGV[0] does not exist\n";
+}
+if ($config && ! -e $config) {
+ die "$config does not exist\n";
+}
+if ($apply && -e $ARGV[0] . '.fixes') {
+ $fixes = retrieve($ARGV[0] . '.fixes');
+}
+
+################################################################## CSV Setup
+$CSV_options{sep_char} = get_separator( path => $ARGV[0], lucky => 1 );
+if ($config && -e $config) {
+ do $config;
+}
+$csv = Text::CSV_XS->new(\%CSV_options);
+$csv->callbacks(
+ error => sub {
+ my ($err, $msg, $pos, $recno) = @_;
+ return if ($err == 2012);
+ $line_numbers_for_lines_with_errors{$line_no} = 1;
+ print "\nline#$line_no * $err : $msg -> (pos#$pos,rec#$recno)\n";
+ if ($csv->error_input) {
+ print $csv->error_input;
+ print "-" x ($pos - 1);
+ print "^\n";
+ }
+ $csv->SetDiag(0);
+ return;
+ }
+);
+
+################################################################## Reading
+
+if ($headers_file) {
+ print "_.,-~= reading $headers_file\n";
+ open my $hfile, "<:encoding(utf8)", $headers_file or die "$headers_file: $!";
+ while (my $line = <$hfile>) {
+ chomp $line;
+ $line =~ s/\s+$//;
+ $line =~ s/^\s+//;
+ push @headers, $line;
+ }
+ close $hfile;
+ $expected_column_count = scalar(@headers);
+ print "Expected column count set to $expected_column_count based on headers.\n";
+}
+
+print "_.,-~= reading $ARGV[0]\n";
+open my $in, "<:encoding(utf8)", $ARGV[0] or die "$ARGV[0]: $!";
+$line_no = 1;
+while (my $line = <$in>) {
+ print ">>> main loop (#$line_no): $line" if $debug;
+ if ($backslash) {
+ $line = convert_backslashes($line);
+ }
+ if ($apply) {
+ $line = apply_line_fixes($line);
+ }
+ if ($csv->parse($line)) {
+ my @columns = $csv->fields();
+ if (! $expected_column_count) {
+ $expected_column_count = scalar(@columns);
+ print "Expected column count set to $expected_column_count based on first row.\n";
+ for (my $i = 0; $i < scalar(@columns) ; $i++) {
+ if ($create_headers) {
+ push @headers, "col" . ($i+1);
+ } else {
+ push @headers, $columns[$i];
+ }
+ }
+ }
+ if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+ if ($fix) {
+ my $fix_status;
+ ($fix_status,$line) = manual_fix($line,\@columns,0); # Regex only
+ if ($fix_status ne 'N') {
+ delete $line_numbers_for_lines_with_errors{$line_no};
+ $fix_count++;
+ redo;
+ }
+ }
+ } else {
+ if (scalar(@columns) < $expected_column_count) {
+ if ($apply) {
+ my $new_line = apply_insert_fixes($line,\@columns);
+ if ($line ne $new_line) {
+ $line = $new_line;
+ redo;
+ }
+ }
+ if ($pad) {
+ $pad_count++;
+ print "\nline#$line_no>> padding line, from " . scalar(@columns) . " columns ";
+ my $col_count = scalar(@columns);
+ for (my $i = 0; $i < $expected_column_count - $col_count; $i++) {
+ push @columns, '#pad#';
+ }
+ print "to " . scalar(@columns) . " columns.\n";
+ eval {
+ print "before: " . format_for_display($line);
+ $line = combine_cols(\@columns);
+ print " after: " . format_for_display($line);
+ };
+ if ($@) {
+ print "padding error:\n";
+ die $@;
+ }
+ redo;
+ }
+ }
+ if (scalar(@columns) > $expected_column_count) {
+ if ($apply) {
+ my $new_line = apply_delete_fixes($line,\@columns);
+ if ($line ne $new_line) {
+ $line = $new_line;
+ redo;
+ }
+ $new_line = apply_join_fixes($line,\@columns);
+ if ($line ne $new_line) {
+ $line = $new_line;
+ redo;
+ }
+ }
+ if ($truncate) {
+ $trunc_count++;
+ print "\nline#$line_no>> truncating line, from " . scalar(@columns) . " columns ";
+ splice @columns, $expected_column_count;
+ print "to " . scalar(@columns) . " columns.\n";
+ eval {
+ print "before: " . format_for_display($line);
+ $line = combine_cols(\@columns);
+ print " after: " . format_for_display($line);
+ };
+ if ($@) {
+ print "truncating error:\n";
+ die $@;
+ }
+ redo;
+ }
+ }
+ if (scalar(@columns) != $expected_column_count) {
+ # so broken, but parseable, and thus not handled by the error callback
+ print "\nline#$line_no * Expected $expected_column_count columns but found " . scalar(@columns) . "\n$line";
+ print "-" x length($line) . "\n";
+ $line_numbers_for_lines_with_errors{$line_no} = 1;
+ if ($fix) {
+ my $fix_status;
+ ($fix_status,$line) = manual_fix($line,\@columns,1); # Insert/Delete allowed
+ if ($fix_status ne 'N') {
+ delete $line_numbers_for_lines_with_errors{$line_no};
+ $fix_count++;
+ redo;
+ }
+ }
+ }
+ }
+ if (defined $line_numbers_for_lines_with_errors{$line_no}) {
+ print "\tIncrementing errors with line# $line_no\n" if $debug;
+ push @lines_with_errors, $line;
+ } else {
+ print "\tIncrementing clean with line# $line_no\n" if $debug;
+ push @parsed_rows, \@columns;
+ }
+ $line_no++;
+ } else {
+ die "Parsing error:\n" . $csv->error_input . "\n";
+ }
+}
+close $in;
+print "_.,-~= read " . ($line_no-1) . " records ";
+print "(" . scalar(@lines_with_errors) . " with errors, $pad_count auto-padded, $trunc_count auto-truncated, $backslash_count backslashes converted, $fix_count manual fixes)\n";
+
+
+################################################################## Writing good CSV
+
+print "_.,-~= writing $ARGV[0].clean\n";
+open my $out, ">:encoding(utf8)", "$ARGV[0].clean" or die "$ARGV[0].clean: $!";
+$line_no = 1;
+$actual_count = 0;
+if ($create_headers || $headers_file) {
+ unshift @parsed_rows, \@headers;
+}
+foreach my $row (@parsed_rows) {
+ eval {
+ $line = combine_cols($row);
+ print $out $line;
+ };
+ if ($@) {
+ print "error:\n";
+ die $@;
+ }
+ $actual_count++;
+}
+close $out;
+print "_.,-~= wrote " . ($actual_count) . " records\n";
+
+
+################################################################## Writing broken CSV
+
+print "_.,-~= writing $ARGV[0].error\n";
+open my $out2, ">:encoding(utf8)", "$ARGV[0].error" or die "$ARGV[0].error: $!";
+foreach my $row (@lines_with_errors) {
+ print $out2 $row;
+}
+close $out2;
+print "_.,-~= wrote " . (scalar @lines_with_errors) . " records\n";
+
+
+################################################################## .no_headers version
+
+print "_.,-~= creating $ARGV[0].clean.no_headers\n";
+
+print `tail -n +2 $ARGV[0].clean > $ARGV[0].clean.no_headers`;
+
+################################################################## Finished
+
+print "_.,-~= finished\n";