6 use Text::CSV::Separator qw(get_separator);
10 # may be manipulated with --config
23 my @lines_with_errors = ();
24 my %line_numbers_for_lines_with_errors = ();
25 my $expected_column_count;
43 my $pad_count = 0; my $trunc_count = 0; my $fix_count = 0; my $backslash_count = 0;
45 ################################################################## Subs
47 sub format_for_display {
48 my $formatted_line = shift;
49 my $sep_char = $CSV_options{sep_char} || '\t';
50 my $sep = color 'bold blue';
51 $sep .= '<' . (ord($sep_char) < 32 ? ord($sep_char) : $sep_char) . '>';
52 $sep .= color 'reset';
53 my $quote_char = $CSV_options{quote_char} || '';
54 my $quote = color 'bold red';
55 $quote .= '<' . (ord($quote_char) < 32 ? ord($quote_char) : $quote_char) . '>';
56 $quote .= color 'reset';
57 my $escape_char = $CSV_options{escape_char} || '';
58 my $escape = color 'bold green';
59 $escape .= '<' . (ord($escape_char) < 32 ? ord($escape_char) : $escape_char) . '>';
60 $escape .= color 'reset';
61 my $real_escape_char = chr(27);
62 my $real_escape = color 'bold green';
63 $real_escape .= '<27>';
64 $real_escape .= color 'reset';
66 $formatted_line =~ s/$real_escape_char/$real_escape/g;
67 $formatted_line =~ s/$sep_char/$sep/g;
68 $formatted_line =~ s/$quote_char/$quote/g;
69 $formatted_line =~ s/$escape_char/$escape/g;
70 for (my $i = 0; $i < 32; $i++) {
71 if ($i == 27) { next; }
72 my $other_char = chr($i);
73 my $other = color 'yellow';
75 $other .= color 'reset';
76 $formatted_line =~ s/$other_char/$other/g;
78 return "$formatted_line\n";
83 my $status = $csv->combine(@{ $row });
84 if ($status && $csv->string) {
85 return $csv->string . "\n";
87 die $csv->error_input . "\n";
91 sub convert_backslashes {
94 my @count = $line =~ /\\/g;
95 if (scalar(@count) > 0) {
96 my $csv2 = Text::CSV_XS->new(\%CSV_options);
97 if ($csv2->parse($line)) {
98 my @columns = $csv2->fields();
99 foreach my $c (@columns) {
104 $altered_line = combine_cols(\@columns);
106 $altered_line =~ s/\\/\//g;
108 if ($line ne $altered_line) {
109 $backslash_count += scalar(@count);
110 print "\nline#$line_no>> Converting " . scalar(@count) . " backslashes to forward slashes\n";
111 print "before: " . format_for_display($line);
112 $line = $altered_line;
113 print " after: " . format_for_display($line);
119 sub apply_line_fixes {
121 foreach my $fix ( @{$fixes->{'R'}} ) {
122 my $id_regex = $fix->[0];
123 if ($line =~ /$id_regex/) {
124 print "\nline#$line_no>> Applying regex fix for $id_regex\n";
126 my $regex1 = $fix->[1];
127 my $regex2 = $fix->[2];
128 my $global = $fix->[3];
129 my $ignore_case = $fix->[4];
130 print "before: " . format_for_display($line);
131 $line = fix_via_regex($id_regex,$line,$regex1,$regex2,$global,$ignore_case);
132 print " after: " . format_for_display($line);
138 sub apply_insert_fixes {
141 foreach my $fix ( @{$fixes->{'I'}} ) {
142 my $id_regex = $fix->[0];
143 my $col_count_check = $fix->[1];
144 if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
145 print "\nline#$line_no>> Applying insert for $id_regex\n";
147 my $fix_cols = $fix->[2];
148 my $fix_value = $fix->[3];
149 print "before: " . format_for_display($line);
150 $line = fix_via_insert($id_regex,$cols,$col_count_check,$fix_cols,$fix_value);
151 print " after: " . format_for_display($line);
157 sub apply_delete_fixes {
160 foreach my $fix ( @{$fixes->{'D'}} ) {
161 my $id_regex = $fix->[0];
162 my $col_count_check = $fix->[1];
163 if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
164 print "\nline#$line_no>> Applying delete for $id_regex\n";
166 my $fix_cols = $fix->[2];
167 print "before: " . format_for_display($line);
168 $line = fix_via_delete($id_regex,$cols,$col_count_check,$fix_cols);
169 print " after: " . format_for_display($line);
175 sub apply_join_fixes {
178 foreach my $fix ( @{$fixes->{'J'}} ) {
179 my $id_regex = $fix->[0];
180 my $col_count_check = $fix->[1];
181 if ($line =~ /$id_regex/ && scalar(@{$cols}) eq $col_count_check) {
182 print "\nline#$line_no>> Applying join for $id_regex\n";
184 my $fix_cols = $fix->[2];
185 print "before: " . format_for_display($line);
186 $line = fix_via_join($id_regex,$cols,$col_count_check,$fix_cols);
187 print " after: " . format_for_display($line);
194 print "saving fix...";
197 if ($nosave) { print "psyche!\n"; return; }
198 print "fix = " . Dumper($fix) . "\n" if $debug;
199 push @{$fixes->{$type}}, $fix;
200 store $fixes, $ARGV[0] . '.fixes';
206 my @f = sort(split /,/, $id_cols || '0');
208 for (my $i = 0; $i < scalar(@f); $i++) {
210 $regex .= '.+'; # characters between id columns
212 $regex .= '.?' . $cols->[$f[$i]] . '.?';
219 my $id_regex = shift;
221 my $fix_regex1 = shift;
222 my $fix_regex2 = shift;
224 my $ignore_case = shift;
229 print "Global (aka s/match/replace/g)? <Yes/No> [n] ";
230 $global = readline(STDIN); chomp $global;
231 $global = uc(substr($global,0,1));
235 $global = uc(substr($global,0,1));
236 if ($global ne 'Y' && $global ne 'N') {
240 print "Ignore-case (aka s/match/replace/i)? <Yes/No> [n] ";
241 $ignore_case = readline(STDIN); chomp $ignore_case;
242 $ignore_case = uc(substr($ignore_case,0,1));
243 if ($ignore_case eq '') {
246 $ignore_case = uc(substr($ignore_case,0,1));
247 if ($ignore_case ne 'Y' && $ignore_case ne 'N') {
252 print "Enter match regex for s/match/replace/: ";
253 $fix_regex1 = readline(STDIN); chomp $fix_regex1;
254 if ($fix_regex1 eq '') {
258 ($global eq 'Y' && $ignore_case eq 'Y' && $line =~ /$fix_regex1/gi)
259 || ($global eq 'Y' && $ignore_case eq 'N' && $line =~ /$fix_regex1/g)
260 || ($global eq 'N' && $ignore_case eq 'N' && $line =~ /$fix_regex1/i)
262 print "Regex matches line.\n";
264 print "Regex does not match line.\n";
268 print "Enter replace regex for s/match/replace/: ";
269 $fix_regex2 = readline(STDIN); chomp $fix_regex2;
270 if (substr($fix_regex1,-1) eq '$') {
271 print "Adding new line to end of /$fix_regex2/ based on \$ in /$fix_regex1/\n";
274 # TODO - how to do we handle backreferences with this?
278 switch ($global . $ignore_case) {
279 case 'YY' { $line =~ s/$fix_regex1/$fix_regex2/gi; }
280 case 'YN' { $line =~ s/$fix_regex1/$fix_regex2/g; }
281 case 'NY' { $line =~ s/$fix_regex1/$fix_regex2/i; }
282 case 'NN' { $line =~ s/$fix_regex1/$fix_regex2/; }
299 my $id_regex = shift;
301 my $col_count_check = shift;
302 my $fix_cols = shift;
303 my $fix_value = shift;
308 $col_count_check = scalar(@{$cols});
309 print "This fix will only trigger when the number of columns is $col_count_check.\n";
310 print "Enter value to insert: [] ";
311 $fix_value = readline(STDIN); chomp $fix_value;
312 print "Enter comma-separated list of column positions (0-based) for insertion: ";
313 $fix_cols = readline(STDIN); chomp $fix_cols;
317 if ($col_count_check != scalar(@{$cols})) {
318 print "WARNING: Insert column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
322 my @f = sort(split /,/, $fix_cols);
323 for (my $i = 0; $i < scalar(@f); $i++) {
324 splice @{ $cols }, $f[$i] + $i, 0, $fix_value;
328 $line = combine_cols($cols);
331 print "fix_via_insert error:\n";
348 my $id_regex = shift;
350 my $col_count_check = shift;
351 my $fix_cols = shift;
356 $col_count_check = scalar(@{$cols});
357 print "This fix will only trigger when the number of columns is $col_count_check.\n";
358 print "Enter comma-separated list of column positions (0-based) to delete: ";
359 $fix_cols = readline(STDIN); chomp $fix_cols;
363 if ($col_count_check != scalar(@{$cols})) {
364 print "WARNING: Delete column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
368 my @f = sort(split /,/, $fix_cols);
369 for (my $i = 0; $i < scalar(@f); $i++) {
370 splice @{ $cols }, $f[$i] - $i, 1;
374 $line = combine_cols($cols);
377 print "fix_via_delete error:\n";
393 my $id_regex = shift;
395 my $col_count_check = shift;
396 my $fix_cols = shift;
401 $col_count_check = scalar(@{$cols});
402 print "This fix will only trigger when the number of columns is $col_count_check.\n";
403 print "Enter comma-separated list of column positions (0-based) to join: ";
404 $fix_cols = readline(STDIN); chomp $fix_cols;
408 if ($col_count_check != scalar(@{$cols})) {
409 print "WARNING: Join column fix with ID regex $id_regex expected $col_count_check but found " . scalar(@{$cols}) . ". Skipping.\n";
415 my @f = sort { $a <=> $b } (split /,/, $fix_cols);
416 for (my $i = 0; $i < scalar(@f); $i++) {
417 $fix_value .= @{ $cols }[$f[$i]];
421 for (my $i = 0; $i < scalar(@f); $i++) {
422 splice @{ $cols }, $f[$i] - $i, 1;
426 splice @{ $cols }, $f[0], 0, $fix_value;
429 $line = combine_cols($cols);
432 print "fix_via_join error:\n";
451 my $insert_delete_allowed = shift;
452 my $col_count = scalar(@{$cols}) > scalar(@headers) ? scalar(@{$cols}) : scalar(@headers);
453 my $max_header_length = 0;
455 # display columns nice and formatted
456 for (my $i = 0; $i < scalar(@headers); $i++) {
457 if (length($headers[$i]) > $max_header_length) {
458 $max_header_length = length($headers[$i]);
461 for (my $i = 0; $i < $col_count; $i++) {
463 if (defined $headers[$i]) {
464 print " " x ($max_header_length - length($headers[$i]));
467 print " " x $max_header_length;
470 if (defined $cols->[$i]) {
471 print "<" . $cols->[$i] . ">";
476 # prompt for type of fix
478 print "\n" . format_for_display($line) . "\nFix line#$line_no? <Regex" . ($insert_delete_allowed ? '|Insert|Delete|Join' : '') . "|No> [n] ";
479 my $ans = readline(STDIN); chomp $ans;
480 $ans = uc(substr($ans,0,1));
484 if ($insert_delete_allowed ? index("RIDJN",$ans)==-1 : index("RN",$ans)==-1) {
488 # prompt for matching condition
491 my $default_id_regex = id_cols_regex($cols);
493 print "If matching the end of the string, you may need to use \\s*\$ instead of \$\n";
494 print "Insert/Delete/Join fixes will also filter on column count.\n";
495 print "Identify this line (and optionally similar lines) with regex: [$default_id_regex] ";
496 $id_regex = readline(STDIN); chomp $id_regex;
497 if ($id_regex eq '') {
498 $id_regex = $default_id_regex;
500 if ($line =~ /$id_regex/) {
501 print "Regex matches line.\n";
503 print "Regex does not match line.\n";
508 # prompt and perform actual fixes
510 case 'R' { $line = fix_via_regex($id_regex,$line); }
511 case 'I' { $line = fix_via_insert($id_regex,$cols); }
512 case 'D' { $line = fix_via_delete($id_regex,$cols); }
513 case 'J' { $line = fix_via_join($id_regex,$cols); }
519 print "\nNew line#$line_no: $line";
521 return ( $ans, $line );
524 ################################################################## Init
526 'config=s' => \$config,
527 'idcols=s' => \$id_cols,
528 'create-headers' => \$create_headers,
529 'use-headers=s' => \$headers_file,
531 'nosave' => \$nosave,
534 'truncate' => \$truncate,
535 'backslash' => \$backslash,
539 if ($help || ((@ARGV == 0) && (-t STDIN))) {
540 die "\n\t$0 [--config <CONFIG>] [--idcols <idx1,idx2,...>] [--fix] [--apply] [--pad] [--truncate] <FILE>\n\n"
541 . "\tExpects <FILE> to be a CSV-like UTF-8 encoded file.\n"
542 . "\tWill produce <FILE>.clean and <FILE>.error versions of said file.\n\n"
543 . "\t--config <CONFIG> will read the Perl file <CONFIG> for settings information. See 'Example Config' below\n\n"
544 . "\t--create-headers will generate headers like so: col1, col2, col3, etc.\n"
545 . "\t--use-headers <HFILE> will generate headers based on the specified <HFILE>, which must contain one column header per line.\n"
546 . "\t(if neither --create-headers nor --use-headers are specified, then the first line in <FILE> is assumed to contain the column headers)\n\n"
547 . "\t--fix will prompt for whether and how to fix broken records, and save those fixes in <FILE>.fixes\n"
548 . "\t--idcols <idx1,idx2,...> takes a comma-separated list of column indexes (starting with 0) to use as matchpoint suggestions for fixes\n"
549 . "\t--nosave will prevent new fixes from being saved in <FILE>.fixes\n"
550 . "\t--apply will apply previously recorded fixes from <FILE>.fixes\n\n"
551 . "\t--pad will fill in missing columns at the end if needed for otherwise unbroken records\n"
552 . "\t--truncate will strip extra columns from the end if needed for otherwise unbroken records\n"
553 . "\t--backslash will convert backslashes into forward slashes\n\n"
554 . "\t Example Config:\n\n"
555 . "\t\t\$CSV_options{quote_char} = '\"';\n"
556 . "\t\t\$CSV_options{escape_char} = '\"';\n"
557 . "\t\t\$CSV_options{sep_char} = ',';\n"
558 . "\t\t\$CSV_options{eol} = \$\\;\n"
559 . "\t\t\$CSV_options{always_quote} = 0;\n"
560 . "\t\t\$CSV_options{quote_space} = 1;\n"
561 . "\t\t\$CSV_options{quote_null} = 1;\n"
562 . "\t\t\$CSV_options{quote_binary} = 1;\n"
563 . "\t\t\$CSV_options{binary} = 0;\n"
564 . "\t\t\$CSV_options{decode_utf8} = 1;\n"
565 . "\t\t\$CSV_options{keep_meta_info} = 0;\n"
566 . "\t\t\$CSV_options{allow_loose_quotes} = 0;\n"
567 . "\t\t\$CSV_options{allow_loose_escapes} = 0;\n"
568 . "\t\t\$CSV_options{allow_unquoted_escape} = 0;\n"
569 . "\t\t\$CSV_options{allow_whitespace} = 0;\n"
570 . "\t\t\$CSV_options{blank_is_undef} = 0;\n"
571 . "\t\t\$CSV_options{empty_is_undef} = 0;\n"
572 . "\t\t\$CSV_options{verbatim} = 0;\n"
576 die "$ARGV[0] does not exist\n";
578 if ($config && ! -e $config) {
579 die "$config does not exist\n";
581 if ($apply && -e $ARGV[0] . '.fixes') {
582 $fixes = retrieve($ARGV[0] . '.fixes');
585 ################################################################## CSV Setup
586 $CSV_options{sep_char} = get_separator( path => $ARGV[0], lucky => 1 );
587 if ($config && -e $config) {
590 $csv = Text::CSV_XS->new(\%CSV_options);
593 my ($err, $msg, $pos, $recno) = @_;
594 return if ($err == 2012);
595 $line_numbers_for_lines_with_errors{$line_no} = 1;
596 print "\nline#$line_no * $err : $msg -> (pos#$pos,rec#$recno)\n";
597 if ($csv->error_input) {
598 print $csv->error_input;
599 print "-" x ($pos - 1);
607 ################################################################## Reading
610 print "_.,-~= reading $headers_file\n";
611 open my $hfile, "<:encoding(utf8)", $headers_file or die "$headers_file: $!";
612 while (my $line = <$hfile>) {
616 push @headers, $line;
619 $expected_column_count = scalar(@headers);
620 print "Expected column count set to $expected_column_count based on headers.\n";
623 print "_.,-~= reading $ARGV[0]\n";
624 open my $in, "<:encoding(utf8)", $ARGV[0] or die "$ARGV[0]: $!";
626 while (my $line = <$in>) {
627 print ">>> main loop (#$line_no): $line" if $debug;
629 $line = convert_backslashes($line);
632 $line = apply_line_fixes($line);
634 if ($csv->parse($line)) {
635 my @columns = $csv->fields();
636 if (! $expected_column_count) {
637 $expected_column_count = scalar(@columns);
638 print "Expected column count set to $expected_column_count based on first row.\n";
639 for (my $i = 0; $i < scalar(@columns) ; $i++) {
640 if ($create_headers) {
641 push @headers, "col" . ($i+1);
643 push @headers, $columns[$i];
647 if (defined $line_numbers_for_lines_with_errors{$line_no}) {
650 ($fix_status,$line) = manual_fix($line,\@columns,0); # Regex only
651 if ($fix_status ne 'N') {
652 delete $line_numbers_for_lines_with_errors{$line_no};
658 if (scalar(@columns) < $expected_column_count) {
660 my $new_line = apply_insert_fixes($line,\@columns);
661 if ($line ne $new_line) {
668 print "\nline#$line_no>> padding line, from " . scalar(@columns) . " columns ";
669 my $col_count = scalar(@columns);
670 for (my $i = 0; $i < $expected_column_count - $col_count; $i++) {
671 push @columns, '#pad#';
673 print "to " . scalar(@columns) . " columns.\n";
675 print "before: " . format_for_display($line);
676 $line = combine_cols(\@columns);
677 print " after: " . format_for_display($line);
680 print "padding error:\n";
686 if (scalar(@columns) > $expected_column_count) {
688 my $new_line = apply_delete_fixes($line,\@columns);
689 if ($line ne $new_line) {
693 $new_line = apply_join_fixes($line,\@columns);
694 if ($line ne $new_line) {
701 print "\nline#$line_no>> truncating line, from " . scalar(@columns) . " columns ";
702 splice @columns, $expected_column_count;
703 print "to " . scalar(@columns) . " columns.\n";
705 print "before: " . format_for_display($line);
706 $line = combine_cols(\@columns);
707 print " after: " . format_for_display($line);
710 print "truncating error:\n";
716 if (scalar(@columns) != $expected_column_count) {
717 # so broken, but parseable, and thus not handled by the error callback
718 print "\nline#$line_no * Expected $expected_column_count columns but found " . scalar(@columns) . "\n$line";
719 print "-" x length($line) . "\n";
720 $line_numbers_for_lines_with_errors{$line_no} = 1;
723 ($fix_status,$line) = manual_fix($line,\@columns,1); # Insert/Delete allowed
724 if ($fix_status ne 'N') {
725 delete $line_numbers_for_lines_with_errors{$line_no};
732 if (defined $line_numbers_for_lines_with_errors{$line_no}) {
733 print "\tIncrementing errors with line# $line_no\n" if $debug;
734 push @lines_with_errors, $line;
736 print "\tIncrementing clean with line# $line_no\n" if $debug;
737 push @parsed_rows, \@columns;
741 die "Parsing error:\n" . $csv->error_input . "\n";
745 print "_.,-~= read " . ($line_no-1) . " records ";
746 print "(" . scalar(@lines_with_errors) . " with errors, $pad_count auto-padded, $trunc_count auto-truncated, $backslash_count backslashes converted, $fix_count manual fixes)\n";
749 ################################################################## Writing good CSV
751 print "_.,-~= writing $ARGV[0].clean\n";
752 open my $out, ">:encoding(utf8)", "$ARGV[0].clean" or die "$ARGV[0].clean: $!";
755 if ($create_headers || $headers_file) {
756 unshift @parsed_rows, \@headers;
758 foreach my $row (@parsed_rows) {
760 $line = combine_cols($row);
770 print "_.,-~= wrote " . ($actual_count) . " records\n";
773 ################################################################## Writing broken CSV
775 print "_.,-~= writing $ARGV[0].error\n";
776 open my $out2, ">:encoding(utf8)", "$ARGV[0].error" or die "$ARGV[0].error: $!";
777 foreach my $row (@lines_with_errors) {
781 print "_.,-~= wrote " . (scalar @lines_with_errors) . " records\n";
784 ################################################################## .no_headers version
786 print "_.,-~= creating $ARGV[0].clean.no_headers\n";
788 print `tail -n +2 $ARGV[0].clean > $ARGV[0].clean.no_headers`;
790 ################################################################## Finished
792 print "_.,-~= finished\n";