2 # -*- coding: iso-8859-15 -*-
3 ###############################################################################
6 =item B<bibstats> --file foo.mrc
8 Reads through a marc file to generate statistical information about the file
11 --uri_threshold defaults to 1; only URI values that occur more than that many times are shown
14 --ignore_filetype true makes it ignore the type reported by the file command and
15 always treat the input as MARC21
20 ###############################################################################
27 HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
28 MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
39 my $mig_bin = "$FindBin::Bin/";
40 use lib "$FindBin::Bin/";
42 use open ':encoding(utf8)';
44 pod2usage(-verbose => 2) if defined $ARGV[0] && $ARGV[0] eq '--help';  # --help as the first argument asks for the most verbose usage output
45 pod2usage(-verbose => 1) if @ARGV < 2;  # need at least an option and its value; counting arguments keeps a literal 0 or empty second argument from being mistaken for a missing one
48 my $uri_threshold = 1;  # URI values are reported only when their count exceeds this value
50 my $p_barcode_subfield;  # --barcode: subfield code; only accepted together with --holding_code
51 my $p_ils_name = 'Runtime ILS';  # --ils_name: label stored alongside a user-supplied holding code
52 my $holding_threshold = 50;  # --holding_threshold: minimum holdings fields per 100 bibs for a candidate to be reported
53 my $p_ignore_filetype = 'false';  # --ignore_filetype: the string 'true' bypasses the MARC21 check
57 'uri_threshold:i' => \$uri_threshold,
58 'holding_code:s' => \$p_holding_code,
59 'barcode:s' => \$p_barcode_subfield,
60 'ignore_filetype:s' => \$p_ignore_filetype,
61 'ils_name:s' => \$p_ils_name,
62 'holding_threshold:s' => \$holding_threshold
65 if ($p_holding_code and length $p_holding_code != 3) { abort('Holdings codes must be three characters.'); }  # only validated when a holding code was supplied
67 if ($p_barcode_subfield) {  # the checks below apply only when --barcode was given
68 if (!defined $p_holding_code) { abort('A barcode field can not be used without a holding code.'); }  # --barcode requires --holding_code
69 if (length $p_barcode_subfield != 1) { abort('Barcode subfields must be a single character code.'); }
73 ['Mandarin','852','p'],
74 ['Evergreen','852','p'],
75 ['Polaris','852','p'],
82 if ($p_holding_code) {  # builds a name / tag / barcode-subfield list in @temp, the same order as built-in entries such as ['Evergreen','852','p']
83 push @temp, $p_ils_name;
84 push @temp, $p_holding_code;
85 if ($p_barcode_subfield) { push @temp, lc $p_barcode_subfield; }  # subfield code is lower-cased; left out entirely when --barcode was not given
91 my $batch = MARC::Batch->new('USMARC', $file);  # record reader over the input file
93 my $filetype = do { open(my $file_cmd, '-|', 'file', '--', $file) or abort("Unable to run file on $file: $!"); local $/; <$file_cmd> };  # list-form pipe runs file without a shell, so spaces or metacharacters in $file are passed through literally; output is slurped whole
94 if ($filetype =~ m/MARC21/ or $p_ignore_filetype eq 'true') { print "$filetype.\n" }  # accept when file reports MARC21, or when the check is overridden
95 else { abort("File is not MARC21."); }
99 my $uri_valid_count = 0;  # incremented for 856 fields whose second indicator is 0 or 1
100 my $uri_sub9_count = 0;  # incremented for 856 fields that have a subfield 9
106 my @holding_code_strings;
111 $holding_counts{$_->[0]} = 0;  # start the holdings tally at zero, keyed by the first element of the current entry (presumably the ILS name)
112 $barcode_counts{$_->[0]} = 0;  # same key, barcode tally
115 while ( my $record = $batch->next() ) {  # one iteration per record returned by the batch reader
117 # check holdings for each candidate tag; a bit time consuming but more future proof
121 my $barcode = $_->[2];  # third element of the current entry, read as a scalar element rather than a one-element slice
122 my @holding_fields = $record->field($hcode);  # every field in this record carrying the candidate tag
123 my $l = scalar @holding_fields;
124 my $v = $holding_counts{$ils};
125 if ($l) { $holding_counts{$ils} = $v + $l; }  # add this record's matches to the running total for $ils
128 @fields = $record->field('856');  # all 856 fields of the current record
129 my $ldr = substr $record->leader(), 9, 1;  # single character at leader offset 9
131 foreach my $f (@fields) {
132 my $u = $f->subfield('u');
133 my $n = $f->subfield('9');
134 if (defined $n) { $uri_sub9_count++; }  # count 856 fields that carry a subfield 9
137 my $ind1 = $f->indicator('1');
138 my $ind2 = $f->indicator('2');
140 if ($ind2 eq '0' or $ind2 eq '1') { $uri_valid_count++; }  # count fields whose second indicator is 0 or 1
142 my $ustring = lc $f->as_string('u');  # lower-cased text of the u subfield content
143 $ustring =~ s/http:\/\///;  # drop the first http://, ftp:// and https:// occurrence ...
144 $ustring =~ s/ftp:\/\///;
145 $ustring =~ s/https:\/\///;
146 $ustring =~ s/\/.*//;  # ... then everything from the first remaining slash onward
147 push @uris, $ustring;  # collected for the per-value tally printed after the loop
150 # check for authority linking (subfield 0) on 100s and 245s; if present they may need to be scrubbed
151 @fields = $record->field('100');
152 foreach my $f (@fields) {
153 my $t = $f->subfield('0');
154 if (defined $t) { $title_sub0++; }  # NOTE(review): despite its name this counter tallies 100 fields; the report prints it under the "100 fields" label
156 @fields = $record->field('245');
157 foreach my $f (@fields) {
158 my $t = $f->subfield('0');
159 if (defined $t) { $author_sub0++; }  # NOTE(review): despite its name this counter tallies 245 fields; the report prints it under the "245 fields" label
161 if(($i % 1000) == 0) { print "Processing bib $i.\n"; }  # progress message whenever $i is a multiple of 1000
165 $uri_counts{$_}++ for @uris;  # occurrences of each collected URI string
168 $code_counts{$_}++ for @codes;  # occurrences of each collected code value
170 print "\n$filetype\n";
171 print "$i bibs read in file\n\n";
173 print "=== Leader 09 codes\n";
174 foreach my $key (keys %code_counts) {  # iterates in hash order, so the output order is not fixed
175 my $value = $code_counts{$key};
176 print "=== $key $value\n";
180 print "$uri_count 856 fields with a subfield u\n";
181 print "$uri_valid_count 856 fields with a subfield u and valid indicators\n";
182 print "$uri_sub9_count 856 fields have subfield 9s\n";
183 print "$title_sub0 100 fields have a subfield 0\n";  # counter name says title, but it holds the 100-field tally
184 print "$author_sub0 245 fields have a subfield 0\n";  # counter name says author, but it holds the 245-field tally
186 print "\n=== Holdings Analysis\n";
187 foreach my $key (keys %holding_counts) {  # one candidate per key of the holdings tally
188 my $c = $holding_counts{$key};
189 if ($i > 0 and ((100/$i)*$c) >= $holding_threshold) { print "Could be $key $holding_counts{$key} holdings tags\n"; }  # holdings fields per 100 bibs against the threshold; the $i > 0 guard avoids a division by zero when no bibs were read
192 print "\nURI values are domains and filtered to only show those with more than $uri_threshold\n";
193 foreach my $key (keys %uri_counts) {  # iterates in hash order, so the output order is not fixed
194 my $value = $uri_counts{$key};
195 if ($value > $uri_threshold) { print "=== $key $value\n"; }  # strictly greater than the threshold; a count equal to it is not shown
200 ########### functions
204 print STDERR "$0: $msg", "\n";