1 package Equinox::Migration::MARCXMLSampler;
7 use Equinox::Migration::SimpleTagList 1.001;
12 Equinox::Migration::MARCXMLSampler
20 our $VERSION = '1.003';
28 Produce a list of all fields in a MARCXML file which have a C<tag>
29 attribute, and count how many times each occurs
31 my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml" );
34 Also deeply introspect certain tags, producing lists of all subfields,
35 and counts of how many times each subfield occurs I<in toto> and how
36 many tags each subfield appears in
38 my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml",
39 mapfile => "foo.map" );
42 my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml",
43 mapstring => "852 999" );
52 Takes one required argument, C<marcfile>, which points to the MARCXML
55 Has two mutually-exclusive optional arguments, C<mapfile> and
56 C<mapstring>. The former should point to a file which will be used as
57 an L<Equinox::Migration::SimpleTagList> map; the latter should have as
58 its value a text string which will be used in the same way (handy for
59 when you only want deep introspection on a handful of tags).
64 my ($class, %args) = @_;
66 $dstore = { rcnt => 0, # record counter
67 tcnt => 0, # tag counter
68 scnt => {}, # subfield/tag counters
69 samp => {}, # data samples
70 tags => {}, # all found tags
73 my $self = bless { data => $dstore,
76 # if we have a sample arg, create the sample map
77 die "Can't use a mapfile and mapstring\n"
78 if ($args{mapfile} and $args{mapstring});
79 $taglist = Equinox::Migration::SimpleTagList->new(file => $args{mapfile})
81 $taglist = Equinox::Migration::SimpleTagList->new(str => $args{mapstring})
82 if ($args{mapstring});
84 # initialize twig and process xml
85 die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
86 if (-r $args{marcfile}) {
87 my $xmltwig = XML::Twig->new( twig_handlers => { record => \&parse_record } );
88 $xmltwig->parsefile( $args{marcfile} );
90 die "Can't open marc file: $!\n";
93 # hand ourselves back for datastore manipulation
100 XML::Twig handler for record elements; drives data extraction process.
105 my ($twig, $record) = @_;
107 my @fields = $record->children;
109 { process_field($f) }
111 # cleanup memory and increment pointer
119 my $tag = $field->{'att'}->{'tag'};
120 return unless ($tag and ($tag =~ /[^0-9]/ or $tag > 9));
122 # increment raw tag count
124 $dstore->{tags}{$tag}++;
127 if ($taglist and $taglist->has($tag)) {
128 my @subs = $field->children('subfield');
131 { process_subs($tag, $sub); $i++ }
133 # increment sub length counter
134 $dstore->{scnt}{$tag}{$i}++;
139 my ($tag, $sub) = @_;
140 my $code = $sub->{'att'}->{'code'};
142 # handle unmapped tag/subs
143 my $samp = $dstore->{samp};
144 # set a sample value, total-seen count and tags-seen-in count
145 $samp->{$tag}{$code}{value} = $sub->text unless ($samp->{$tag}{$code}{value} and
146 $samp->{$tag}{$code}{value} =~ /\w/);
147 $samp->{$tag}{$code}{count}++;
148 $samp->{$tag}{$code}{tcnt}++ unless ( defined $samp->{$tag}{$code}{last} and
149 $samp->{$tag}{$code}{last} == $dstore->{tcnt} );
150 $samp->{$tag}{$code}{last} = $dstore->{tcnt};
156 If the C<mapfile> or C<mapstring> arguments are passed to L</new>, a
157 structure will be constructed which holds data about tags in the map.
160 sub_code => { value => VALUE,
169 For each subfield in each mapped tag, there is a hash of data about
170 that subfield containing
172 * value - A sample of the subfield text
173 * count - Total number of times the subfield was seen
174 * tcnt - The number of tags the subfield was seen in
178 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
182 Please report any bugs or feature requests to the above email address.
186 You can find documentation for this module with the perldoc command.
188 perldoc Equinox::Migration::MARCXMLSampler
191 =head1 COPYRIGHT & LICENSE
193 Copyright 2009 Equinox, all rights reserved.
195 This program is free software; you can redistribute it and/or modify it
196 under the same terms as Perl itself.
201 1; # End of Equinox::Migration::MARCXMLSampler