From: Rogan Hamby
Date: Tue, 15 Oct 2019 14:52:29 +0000 (-0400)
Subject: adding some basic scripts for cleaning up text from mandarin pdf reports to make...
X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=d32d64cbc0b841f0d2988877228b03d3612d9e80

adding some basic scripts for cleaning up text from mandarin pdf reports to make it delimited
---

diff --git a/mandarin/clean_mandarin_fine_list_report.pl b/mandarin/clean_mandarin_fine_list_report.pl
new file mode 100755
--- /dev/null
+++ b/mandarin/clean_mandarin_fine_list_report.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl
+
+# Converts the raw text of a Mandarin "Fines Due by Patron" PDF report into
+# tab-delimited rows on STDOUT: patron, item barcode, fine.
+#
+# Usage: clean_mandarin_fine_list_report.pl report.txt > fines.tsv
+#
+# To create the input, copy and paste the raw text from Acrobat Reader into a
+# .txt file.
+#
+# NOTE: parsing is positional. Every line that is not recognised as page
+# furniture and does not end in an amount is taken as a patron line, and its
+# last space-separated token becomes the patron for the rows that follow.
+
+use strict;
+use warnings;
+
+my $filename = $ARGV[0];
+die "Usage: $0 <report.txt>\n" unless defined $filename;
+
+open(my $fh, '<:encoding(UTF-8)', $filename)
+    or die "Could not open file '$filename' $!";
+
+# Input is decoded as UTF-8, so encode output the same way instead of letting
+# perl emit Latin-1 bytes or "Wide character" warnings.
+binmode(STDOUT, ':encoding(UTF-8)');
+
+my $user = '';    # patron the following item rows belong to
+
+while (my $row = <$fh>) {
+    chomp $row;
+    $row =~ s/\r$//;    # tolerate CRLF line endings from Windows pastes
+
+    # Skip report titles, column headings, page footers and the end marker.
+    next if $row =~ m/Fines Due by [Pp]atron/;
+    next if $row =~ m/Patron Title Barcode Date Due Fine/;
+    next if $row =~ m/Page: /;
+    next if $row =~ m/Program Files \(x86\)/;
+    next if $row =~ m/End Of Report/;
+
+    my @str = split / /, $row;
+    next unless @str;    # blank line: nothing to parse, keep current patron
+
+    # Skip what appears to be the report timestamp line ("<time> AM|PM
+    # <date>"). Any 19xx/20xx year is accepted, not only 2019.
+    if (@str > 2 and ($str[1] eq 'AM' or $str[1] eq 'PM')) {
+        next if $str[2] =~ m/(?:19|20)\d\d/;
+    }
+
+    my $fine = $str[-1];
+
+    # No amount (at least one digit, optional decimal point) at the end of
+    # the line: treat it as a patron line.
+    if ($fine !~ m/^(?:\d+\.?\d*|\.\d+)$/) {
+        $user = $fine;
+        next;
+    }
+
+    # Item rows are read as "... <barcode> <x> <fine>" (the column heading
+    # suggests <x> is the date due). With fewer than three fields the
+    # barcode index would wrap around to the wrong field, so skip the row.
+    if (@str < 3) {
+        warn "Skipping short row at line $. of '$filename': $row\n";
+        next;
+    }
+
+    my $i = $str[-3];    # item barcode
+    print "$user\t$i\t$fine\n";
+}
+
+close($fh);
diff --git a/mandarin/clean_mandarin_hold_list_by_patron.pl b/mandarin/clean_mandarin_hold_list_by_patron.pl
new file mode 100755
--- /dev/null
+++ b/mandarin/clean_mandarin_hold_list_by_patron.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl
+
+# Converts the raw text of a Mandarin "Items On Hold by Patron" PDF report
+# into tab-delimited rows on STDOUT: patron, item, posted, expire.
+#
+# Usage: clean_mandarin_hold_list_by_patron.pl report.txt > holds.tsv
+#
+# To create the input, copy and paste the raw text from Acrobat Reader into a
+# .txt file.
+#
+# NOTE: parsing is positional. Every line that is not recognised as page
+# furniture and does not end in a date is taken as a patron line, and its
+# last space-separated token becomes the patron for the rows that follow.
+
+use strict;
+use warnings;
+
+my $filename = $ARGV[0];
+die "Usage: $0 <report.txt>\n" unless defined $filename;
+
+open(my $fh, '<:encoding(UTF-8)', $filename)
+    or die "Could not open file '$filename' $!";
+
+# Input is decoded as UTF-8, so encode output the same way instead of letting
+# perl emit Latin-1 bytes or "Wide character" warnings.
+binmode(STDOUT, ':encoding(UTF-8)');
+
+my $user = '';    # patron the following item rows belong to
+
+while (my $row = <$fh>) {
+    chomp $row;
+    $row =~ s/\r$//;    # tolerate CRLF line endings from Windows pastes
+
+    # Skip report titles, column headings, page footers and the end marker.
+    next if $row =~ m/Items On Hold by Patron/;
+    next if $row =~ m/Patron Title Barcode /;
+    next if $row =~ m/Page: /;
+    next if $row =~ m/Program Files \(x86\)/;
+    next if $row =~ m/End Of Report/;
+
+    my @str = split / /, $row;
+    next unless @str;    # blank line: nothing to parse, keep current patron
+
+    # Skip what appears to be the report timestamp line ("<time> AM|PM
+    # <date>"). Any 19xx/20xx year is accepted, not only 2019.
+    if (@str > 2 and ($str[1] eq 'AM' or $str[1] eq 'PM')) {
+        next if $str[2] =~ m/(?:19|20)\d\d/;
+    }
+
+    my $posted = $str[-1];
+
+    # No date at the end of the line: treat it as a patron line.
+    if ($posted !~ m{\d\d/\d\d/\d\d\d\d}) {
+        $user = $posted;
+        next;
+    }
+
+    # Item rows are read as "... <item> <expire> <posted>". With fewer than
+    # three fields the item index would wrap around to the wrong field.
+    if (@str < 3) {
+        warn "Skipping short row at line $. of '$filename': $row\n";
+        next;
+    }
+
+    my $expire = $str[-2];
+    my $item   = $str[-3];
+    print "$user\t$item\t$posted\t$expire\n";
+}
+
+close($fh);
diff --git a/mandarin/clean_mandarin_loan_list_report.pl b/mandarin/clean_mandarin_loan_list_report.pl
new file mode 100755
--- /dev/null
+++ b/mandarin/clean_mandarin_loan_list_report.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl
+
+# Converts the raw text of a Mandarin "Items On Loan by Patron" PDF report
+# into tab-delimited rows on STDOUT: patron, item, due, borrowed.
+#
+# Usage: clean_mandarin_loan_list_report.pl report.txt > loans.tsv
+#
+# To create the input, copy and paste the raw text from Acrobat Reader into a
+# .txt file.
+#
+# NOTE: parsing is positional. Every line that is not recognised as page
+# furniture and does not end in a date is taken as a patron line, and its
+# last space-separated token becomes the patron for the rows that follow.
+
+use strict;
+use warnings;
+
+my $filename = $ARGV[0];
+die "Usage: $0 <report.txt>\n" unless defined $filename;
+
+open(my $fh, '<:encoding(UTF-8)', $filename)
+    or die "Could not open file '$filename' $!";
+
+# Input is decoded as UTF-8, so encode output the same way instead of letting
+# perl emit Latin-1 bytes or "Wide character" warnings.
+binmode(STDOUT, ':encoding(UTF-8)');
+
+my $user = '';    # patron the following item rows belong to
+
+while (my $row = <$fh>) {
+    chomp $row;
+    $row =~ s/\r$//;    # tolerate CRLF line endings from Windows pastes
+
+    # Skip report titles, column headings, page footers and the end marker.
+    next if $row =~ m/Items On Loan by Patron/;
+    next if $row =~ m/Patron Title Barcode /;
+    next if $row =~ m/Page: /;
+    next if $row =~ m/Program Files \(x86\)/;
+    next if $row =~ m/End Of Report/;
+
+    my @str = split / /, $row;
+    next unless @str;    # blank line: nothing to parse, keep current patron
+
+    # Skip what appears to be the report timestamp line ("<time> AM|PM
+    # <date>"). Any 19xx/20xx year is accepted, not only 2019.
+    if (@str > 2 and ($str[1] eq 'AM' or $str[1] eq 'PM')) {
+        next if $str[2] =~ m/(?:19|20)\d\d/;
+    }
+
+    my $due = $str[-1];
+
+    # No date at the end of the line: treat it as a patron line.
+    if ($due !~ m{\d\d/\d\d/\d\d\d\d}) {
+        $user = $due;
+        next;
+    }
+
+    # Item rows are read as "... <item> <borrowed> <due>". With fewer than
+    # three fields the item index would wrap around to the wrong field.
+    if (@str < 3) {
+        warn "Skipping short row at line $. of '$filename': $row\n";
+        next;
+    }
+
+    my $borrowed = $str[-2];
+    my $item     = $str[-3];
+    print "$user\t$item\t$due\t$borrowed\n";
+}
+
+close($fh);