From: Shawn Boyette Date: Thu, 16 Apr 2009 06:58:27 +0000 (+0000) Subject: at least it compiles now, and the docs are mostly in place X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=6122a9e8332fdd507a98251a9eed96662e05647a at least it compiles now, and the docs are mostly in place --- diff --git a/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm b/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm index 609468c..3eb3766 100644 --- a/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm +++ b/Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm @@ -1,4 +1,4 @@ -package Equinox::Migration::MapDrivenXMLProc; +package Equinox::Migration::MapDrivenMARCXMLProc; use warnings; use strict; @@ -8,7 +8,7 @@ use Equinox::Migration::SubfieldMapper; =head1 NAME -Equinox::Migration::MapDrivenXMLProc +Equinox::Migration::MapDrivenMARCXMLProc =head1 VERSION @@ -23,124 +23,207 @@ our $VERSION = '1.000'; Foo - use Equinox::Migration::MapDrivenXMLProc; + use Equinox::Migration::MapDrivenMARCXMLProc; -=cut +=head1 METHODS +=head2 new -=head1 METHODS +Takes two required arguments: C (which will be passed along +to L as the basis for its map), +and C (the MARC data to be processed). + my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE, + marcfile => FILE ); -=head2 new +There is an optional third, argument, C, which specifies a +arrayref of datafields to "sample" by reporting on subfields which are +found in the data but not in the map. + + my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE, + marcfile => FILE, + sample => \@TAGS + ); + +See L for more info. =cut sub new { my ($class, %args) = @_; - my $self = bless { conf => { count => 0, - total => 0, - quiet => 0, + my $self = bless { mods => { multi => {}, + once => {}, + required => {}, + }, + data => { recs => undef, # X::T record objects + rptr => 0, # next record pointer + crec => undef, # parsed record storage + stag => undef, # list of tags to sample + umap => undef, # unmapped data samples }, - map => Equinox::Migration::SubfieldMapper->new(file => $args{mapfile}), - tags => {}, - twig => XML::Twig->new( twig_handlers => { record => \&record } ), }, $class; - if ($args{marcfile}) { - if (-r $args{marcfile}) { - $self->{conf}{marc} = $args{marcfile}; - $self->generate; - } else { - die "Can't open marc file: $!\n"; - } + # initialize map and taglist + my @mods = keys %{$self->{mods}}; + $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile}, + mods => \@mods ); + $self->{tags} = $self->{map}->tags; + + # initialize twig + die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile}); + if (-r $args{marcfile}) { + $self->{twig} = XML::Twig->new; + $self->{twig}->parsefile($args{marcfile}); + my @records = $self->{twig}->root->children; + $self->{data}{recs} = \@records; + } else { + die "Can't open marc file: $!\n"; } - $self->{twig}->parsefile($self->{conf}{marc}); - return $self; } -sub parse { - my ($self) = @_; -} +=head2 parse_record -sub emit_status { - my ($self) = @_; - my $c = $self->{conf}; - return if $c->{quiet}; - $c->{count}++; - my $percent = int(($c->{count} / $c->{total}) * 100); - print STDERR "\r$percent% done (", $c->{count}, ")"; -} +Extracts data from the next record, per the mapping file. Returns 1 on +success, 0 otherwise. + while ($m->parse_record) { + # handle extracted record data + } -=head2 XML::Twig CALLBACK ROUTINES +=cut -=head3 record +sub parse_record { + my ($self) = @_; -=cut + # get the next record and wipe current parsed record + return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ]; + my $record = $self->{data}{recs}[ $self->{data}{rptr} ]; + $self->{data}{crec} = {}; -sub record { - my($t, $r)= @_; - $self->{holdings} = {}; + my @fields = $record->children; + for my $f (@fields) + { $self->process_field($f) } - my @dfields = $r->children('datafield'); - for my $d (@dfields) { - process_datafields($d); - } - write_data_out(); - $r->purge; + # cleanup memory and increment pointer + $record->purge; + $self->{data}{rptr}++; } -=head3 process_datafields +=head2 process_field =cut -sub process_datafields { - my ($d) = @_; +sub process_field { + my ($self, $field) = @_; my $map = $self->{map}; - my $tag = $d->{'att'}->{'tag'}; + my $tag = $field->{'att'}->{'tag'}; + my $parsed = $self->{data}{crec}; if ($tag == 903) { - my $s = $d->first_child('subfield'); - $self->{holdings}{id} = $s->text;; + my $sub = $field->first_child('subfield'); + $parsed->{egid} = $sub->text;; } elsif ($map->has($tag)) { - push @{$self->{holdings}{copies}}, { tag => $tag }; - my @subs = $d->children('subfield'); + push @{$parsed->{tags}}, { tag => $tag }; + my @subs = $field->children('subfield'); for my $sub (@subs) - { process_subs($tag, $sub) } + { $self->process_subs($tag, $sub) } } } -=head3 process_subs +=head2 process_subs =cut sub process_subs { - my ($tag, $sub) = @_; + my ($self, $tag, $sub) = @_; my $map = $self->{map}; my $code = $sub->{'att'}->{'code'}; + # handle unmapped tag/subs unless ($map->has($tag, $code)) { - # this is a subfield code we don't have mapped. report on it if this is a sample tag - push @{$c->{sample}{$tag}}, $code if defined $c->{sample}{tag}; + my $u = $self->{data}{umap}; + my $s = $self->{data}{stag}; + return unless (defined $s->{$tag}); + + $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code}; + $u->{$tag}{$code}{count}++; return; } - my $copy = $self->{holdings}{copies}[-1]; + my $data = $self->{data}{crec}{tags}[-1]; my $field = $map->field($tag, $code); - if ($map->mod($field) eq 'multi') { + if ($map->mod($field) eq 'multi') { my $name = $tag . $code; - push @{$copy->{multi}{$name}}, $sub->text; + push @{$data->{multi}{$name}}, $sub->text; } else { - $copy->{uni}{$code} = $sub->text; + $data->{uni}{$code} = $sub->text; } } +=head1 PARSED RECORDS + + { + egid => evergreen_record_id, + bib => { + (tag_id . sub_code)1 => value1, + (tag_id . sub_code)2 => value2, + ... + }, + tags => [ + { + tag => tag_id, + multi => { (tag_id . sub_code) => [ val1, val2, ... ] }, + uni => { code => value, code2 => value2, ... }, + }, + ... + ] + } + +That is, there is an C key which points to the Evergreen ID of +that record, a C key which points to a hashref, and a C +key which points to an arrayref. + +=head3 C + +This hashref holds extracted data which should occur once per record +(the default assumption is that a tag/subfield pair can occur multiple +times per record). The keys are composed of tag id and subfield code, +catenated (e.g. 901c). The values are the contents of that subfield of +that tag. + +=head3 C + +This arrayref holds anonymous hashrefs, one for each instance of each +tag which occurs in the map. Each tag hashref holds its own id +(e.g. C<998>), and two more hashrefs, C and C. + +The C hashref holds the extracted data for tag/sub mappings +which have the C modifier on them. The keys in C are +composed of the tag id and subfield code, catenated +(e.g. C<901c>). The values are arrayrefs containing the content of all +instances of that subfield in that instance of that tag. + +The C hashref holds data for tag/sub mappings which occur only +once per instance of a tag (but may occur multiple times in a record +due to there being multiple instances of that tag in a record). Keys +are subfield codes and values are subfield content. + +=head1 UNMAPPED TAGS + + { tag_id => { + sub_code => { value => VALUE, count => COUNT }, + sub_code2 => { value => VALUE, count => COUNT }, + ... + }, + ... + } + =head1 AUTHOR Shawn Boyette, C<< >> @@ -153,7 +236,7 @@ Please report any bugs or feature requests to the above email address. You can find documentation for this module with the perldoc command. - perldoc Equinox::Migration::MapDrivenXMLProc + perldoc Equinox::Migration::MapDrivenMARCXMLProc =head1 COPYRIGHT & LICENSE @@ -166,4 +249,4 @@ under the same terms as Perl itself. =cut -1; # End of Equinox::Migration::MapDrivenXMLProc +1; # End of Equinox::Migration::MapDrivenMARCXMLProc