On Mon, Mar 28, 2016 at 02:01:42PM +0000, Jiao, Dazhi wrote:
> At IU we are running our OPAC using Blacklight. From time to time
> there are some catalog errors that would cause errors in our custom
> code to extract fields from the MARC records. For example, sometimes a
> subfield may unexpectedly appear in a field, or an expected subfield
> is accidentally named to another subfield.
>
> While we can catch these errors in our code, we’d also like to be able
> to discover them and notify the catalogers before the records are
> exposed in the discover layer. I wonder if anyone here has experiences
> with some MARC validation tool for this purpose?
Are you talking about low-level structural problems, like the record
length field (Ldr/00-05) not matching the actual record length? Or
high-level errors, like a 245 field without a subfield $a or an invalid
country code?
If it's the latter, Bryan Baldus wrote a Perl module (MARC::Lint) that
looks pretty comprehensive:
https://metacpan.org/release/MARC-Lint
It includes a script (marclint) that you can use directly from the
command line.
If it's the former, I've attached a Perl script I wrote (marcdiag) that
catches most low-level errors. It only has one dependency --
Getopt::Long, which is pretty standard but not part of a core Perl
installation.
Paul.
--
Paul Hoffman <[log in to unmask]>
Systems Librarian
Fenway Libraries Online
c/o Wentworth Institute of Technology
550 Huntington Ave.
Boston, MA 02115
(617) 442-2384 (FLO main number)
#!/m1/shared/bin/perl
use strict;
use warnings;
use bytes;
use Getopt::Long
qw(:config posix_default gnu_compat require_order bundling no_ignore_case);
use constant RECORD_TERMINATOR => "\x1d";
use constant FIELD_TERMINATOR => "\x1e";
use constant SUBFIELD_DELIMITER => "\x1f";
sub emit;
sub summarize;
sub record;
sub fatal;
sub info;
sub error;
sub warning;
sub ok;
sub fail;
my $rx_record_terminator = qr/\x1d/;
my $rx_field_terminator = qr/\x1e/;
my $rx_subfield_delimiter = qr/\x1f/;
my $rx_struc_char = qr/[\x1d-\x1f]/;
my $rx_nonstruc_char = qr/[^\x1d-\x1f]/;
my %ecode2format = (
L05 => "Unrecognized record status: %s",
L06 => "Unrecognized record type: %s",
L07 => "Unrecognized bib level: %s",
L08 => "Unrecognized control type: %s",
L09 => "Unrecognized character encoding: %s",
L10 => "Invalid indicator count: %s",
L11 => "Invalid subfield code count: %s",
L17 => "Unrecognized encoding level: %s",
L18 => "Unrecognized cataloging form: %s",
M18 => "Unrecognized item info: %s",
L19 => "Unrecognized multipart resource record level: %s",
L20 => "Invalid length-of-field length: %s",
L21 => "Invalid length-of-offset length: %s",
L22 => "Invalid length-of-impldef length: %s",
L23 => "Invalid undefined value in leader: %s",
DLN => "Directory length: not a multiple of 12 bytes",
DNT => "Directory not terminated",
EFT => "Embedded field terminator in field %s",
ERT => "Embedded record terminator in field %s",
ETG => "Invalid field tag %s",
ESD => "Subfield delimiter in control field %s",
IIN => "Invalid indicator in field %s",
ISI => "Invalid subfield identifier %s in field %s",
IUT => "Invalid UTF-8 in field %s",
JAB => "Junk at beginning of field %s",
JAE => "Junk at end of field %s",
MSI => "Missing subfield identifier in field %s",
NSF => "Empty subfield %s in field %s",
NWF => "Not a USMARC record: pathological leader",
RLN => "Record length doesn't match the length encoded in the leader",
TRU => "Truncated field %s",
UFD => "Unterminated field %s",
);
my %rstat2label = (
'a' => 'increase in encoding level',
'c' => 'corrected or revised',
'd' => 'deleted',
'n' => 'new',
'p' => 'increase in encoding level from prepublication',
);
my ($show_error_details, $verbose, $parseable, $terse, $quiet, $check_utf8, $strict);
my $summarize = 1;
my %ignore;
my (%err, %errmsg, %enc, %bad);
my %rstat = ('c' => 0, 'd' => 0, 'n' => 0);
my $max_errs_in_record = 1;
my $max_errs_total = 1<<31;
GetOptions(
'h' => \&usage,
'l' => \&list_codes,
'r=i' => \$max_errs_in_record,
't=i' => \$max_errs_total,
'a' => sub { $max_errs_total = $max_errs_in_record = 1 << 31 },
'1' => sub { $max_errs_total = $max_errs_in_record = 1 },
'u' => \$check_utf8,
's' => \$strict,
'e' => \$show_error_details,
'x=s' => sub { $ignore{$_} = 1 for split /,/, $_[1] },
'v' => \$verbose,
'E' => \$terse,
'n' => sub { $summarize = 0 },
'p' => \$parseable,
'q' => \$quiet,
) or usage();
my ($file, $fh);
if (@ARGV == 1) {
$file = shift @ARGV;
open $fh, '<', $file
or fatal "Can't open input file $file: $!";
}
elsif (@ARGV == 0) {
$file = '<stdin>';
$fh = \*STDIN;
}
else {
usage();
}
binmode $fh;
binmode STDOUT;
my ($n, $rtype_bib, $rtype_mfhd, $rtype_auth, $rtype_other) = (0, 0, 0, 0, 0);
my $skipped = 0;
my $byte_pos = 0;
my $bibid;
my $printed;
my $errs_total = 0;
my $errs_in_record = 0;
$/ = RECORD_TERMINATOR;
RECORD:
while (defined(my $rec = <$fh>)) {
$n++;
$errs_in_record = 0;
undef $printed;
eval {
my ($rlen, $rstat, $rtype, $blvl, $ctype, $enc, $icount, $scount, $baddr, $elvl, $cform, $mrrl, $loflen, $ofslen, $implen, $undef) =
(
$rec =~ m{
\A
# bytes description
# ----- -------------------------------------------
(\d{5}) # 00-04 Rec length
(.) # 05 Record status
(.) # 06 Type of record
(.) # 07 Bibliographic Level
(.) # 08 Type of control
(.) # 09 Character coding
(.) # 10 Indicator count
(.) # 11 Subfield code count
(\d{5}) # 12-16 Base address = length of leader + directory
(.) # 17 Encoding level
(.) # 18 Descriptive cataloging form
(.) # 19 Multipart resource record level
(.) # 20 Length of the length-of-field portion
(.) # 21 Length of the starting-character-position portion
(.) # 22 Length of the implementation-defined portion
(.) # 23 Undefined
}x
);
$enc{$enc}++;
$rstat{$rstat}++;
my $leader = substr($rec, 0, 24);
if (!defined $rlen) {
error NWF => $leader;
next RECORD;
}
my $reclen = length $rec;
error RLN => $rlen, length($rec) if $rlen != length $rec;
if ($rtype =~ /[acdefgijkmoprt]/) {
$rtype_bib++;
error L05 => $rstat if $rstat !~ /[acdnp]/;
error L07 => $blvl if $blvl !~ /[abcdims]/;
error L17 => $elvl if $elvl !~ /[ 1234578uz]/;
error L18 => $cform if $cform !~ /[ aciu]/;
error L19 => $mrrl if $mrrl !~ /[ abc]/;
}
elsif ($rtype =~ /[uvxy]/) {
$rtype_mfhd++;
error L05 => $rstat if $rstat !~ /[cdn]/;
error L17 => $elvl if $elvl !~ /[12345muz]/;
error M18 => $cform if $cform !~ /[in]/;
}
elsif ($rtype eq 'z') {
$rtype_auth++;
}
else {
$rtype_other++;
error L06 => $rtype;
}
error L09 => $enc if $enc !~ /[ a]/;
error L10 => $icount if $icount ne '2';
error L11 => $scount if $scount ne '2';
error L20 => $loflen if $loflen ne '4';
error L21 => $ofslen if $ofslen ne '5';
error L22 => $implen if $implen ne '0';
error L23 => $undef if $undef ne '0';
my $directory = substr($rec, 24, $baddr - 24);
my $dirlen = length $directory;
my $dirend = substr($directory, -1, 1);
error DLN => $dirlen if $dirlen % 12 != 1;
error DNT => ord $dirend if $dirend ne FIELD_TERMINATOR;
warning("Record claims not to be UTF-8") if $check_utf8 && $enc ne 'a';
FIELD:
while ($directory =~ /(...)(....)(.....)/gc) {
my ($tag, $len, $ofs) = ($1, $2, $3);
my $value = substr($rec, $baddr + $ofs, $len);
error ERT => $tag if $value =~ /$rx_record_terminator./;
error EFT => $tag if $value =~ /$rx_field_terminator./;
error ETG => $tag if $tag =~ /[^A-Za-z0-9]/
|| $tag =~ /[a-z]/ && $tag =~ /[A-Z]/;
error UFD => $tag if substr($value, -1) ne FIELD_TERMINATOR;
error IUT => $tag if $check_utf8 && !check_utf8(\$value);
if ($tag lt '010') {
# Control field
error ESD => $tag if $value =~ $rx_subfield_delimiter;
$bibid = substr($value, 0, -1) if $tag eq '001';
}
else {
# Data field
if (length($value) < 2) {
error TRU => $tag;
next;
}
$value =~ s/^(.)(.)//;
error IIN => $tag if grep { $_ < 32 || $_ > 127 } map { ord $_ } ($1, $2);
my $num_subfields = 0;
error JAB => $tag, $1 if $value =~ s/^($rx_nonstruc_char+)//;
SUBFIELD:
while ($value =~ s/^$rx_subfield_delimiter($rx_nonstruc_char*)//g) {
my $svalue = $1;
if ($svalue !~ s/^(.)//) {
error MSI => $tag;
next;
}
my $id = $1;
if ($strict) {
error ISI => $id, $tag if $id !~ /[0-9a-z]/;
error NSF => $id if $svalue eq '';
}
}
error JAE => $tag unless $value eq FIELD_TERMINATOR;
}
}
};
}
continue {
$byte_pos += length($rec);
if (defined($max_errs_total) && $errs_total >= $max_errs_total) {
$skipped++ while <$fh>;
}
}
# --- Report results
if (!$quiet) {
my $total = $n + $skipped;
info sprintf('File contains %d %s', $total, plural($total, 'record(s)'));
info "Record types:",
sprintf('%8d bib', $rtype_bib),
sprintf('%8d MFHD', $rtype_mfhd),
sprintf('%8d authority', $rtype_auth),
sprintf('%8d other', $rtype_other),
;
my $enc_marc8 = delete $enc{' '} || 0;
my $enc_utf8 = delete $enc{'a'} || 0;
my $enc_other = 0; $enc_other += $_ for values %enc;
info "Character encodings:",
sprintf('%8d MARC-8', $enc_marc8),
sprintf('%8d UTF-8', $enc_utf8),
sprintf('%8d other', $enc_other),
;
my @rstat = ("Record statuses:");
foreach (sort keys %rstat) {
push @rstat, sprintf('%8d %s', $rstat{$_}, $rstat2label{$_} || "$_ [invalid]");
}
info @rstat;
if ($errs_total == 0) {
ok 'All records are valid';
}
else {
my $bad_recs = scalar keys %bad;
my $good_recs = $n - $bad_recs;
info sprintf('Total: %d %s detected', $errs_total, plural($errs_total, 'error(s)')),
'Record summary:',
sprintf('%8d valid', $good_recs),
sprintf('%8d invalid', $bad_recs),
sprintf('%8d skipped', $skipped);
if (defined($max_errs_total) && $errs_total == $max_errs_total) {
info 'Maximum number of errors reached';
}
summarize if $summarize;
fail "$file is not valid";
exit 3;
}
}
exit($errs_total > 0 ? 3 : 0);
# --- Functions
sub plural {
my ($num, $str) = @_;
# dog(s) --> dog | dogs
# child(ren) --> child | children
# stor(y|ies) -> story | stories
$str =~ s/\(([^()]*)\)$// or return $str;
my $sfx = $1;
my @pl = split '|', $sfx;
return $str . $pl[-1] if $num != 1;
return $str if @pl == 1;
return $str . $pl[0];
}
sub check_utf8 {
my ($strref) = @_;
return $$strref =~ m{
\A
(?:
[\x00-\x7f]
|
[\xc2-\xdf][\x80-\xbf]
|
[\xe0-\xef][\x80-\xbf]{2}
|
[\xf0-\xf4][\x80-\xbf]{3}
)*
\z
}xg;
}
sub error {
my ($ecode, @args) = @_;
return if $ignore{$ecode};
$bad{$n} = 1;
$err{$ecode}++;
$errmsg{$ecode}{join(','),@args}++;
$errs_total += @_;
$errs_in_record += @_;
if ($verbose) {
my $msg = sprintf "<%s> $ecode2format{$ecode}", $ecode, @args;
record;
emit '####' => $msg;
}
die if $errs_in_record > $max_errs_in_record
|| $errs_total > $max_errs_total;
}
sub warning {
if (!$quiet) {
record;
emit '????' => $_ for @_;
}
}
sub info {
return if $terse;
emit 'INFO' => $_ for @_;
}
sub record {
emit 'RECN' => sprintf("%d @ %d%s",
$n, $byte_pos, (defined $bibid ? " :: 001 = <$bibid>" : ''))
if !$printed++;
}
sub fatal {
emit 'EXIT' => "Fatal error: $_" for @_;
exit 2;
}
sub summ {
emit 'SUMM' => "@_" if $summarize;
}
sub ok {
emit 'OK' => "@_";
}
sub fail {
emit 'FAIL' => "@_";
}
sub summarize {
summ 'Error counts:';
foreach my $ecode (sort keys %err) {
my $fmt = $ecode2format{$ecode};
$fmt =~ s/:? %s//g;
summ sprintf('%8d %s %s', $err{$ecode}, $ecode, $fmt);
if ($show_error_details) {
summ sprintf(' |%8d = %s', $errmsg{$ecode}{$_}, $_) for sort keys %{ $errmsg{$ecode} };
}
}
}
sub emit {
if (!$parseable) {
shift;
printf STDERR "%s\n", @_;
}
else {
printf STDERR "%4.4s %s\n", @_;
}
}
sub usage {
print STDERR <<'EOS';
Usage: marcdiag [OPTION]... [FILE]
Options:
-r NUM Report no more than NUM errors in a record
-t NUM Report no more than NUM errors total
-x CODE Don't report CODE errors (e.g., RLN or L07)
-a Report all errors
-1 Stop after the first error
-u Check for invalid UTF-8 sequences
-s Be more strict when checking subfields
-p Produce parseable output
-v Be verbose (report every occurrence of an error)
-q Be quiet
-E Be terse (only report errors)
-e Show error details at the end
-n Don't print a summary at the end
-h Show this usage information
-l List error codes and descriptions
Exit status:
0 No errors
1 Usage error
2 Unable to open FILE
3 At least one error was detected
EOS
exit 1;
}
sub list_codes {
print STDERR "Error codes:\n";
printf STDERR "%s %s\n", $_, $ecode2format{$_} for sort keys %ecode2format;
exit 0;
}
|