#!perl -w use strict; use XML::Twig; use Getopt::Long; use Pod::Usage; use File::Temp qw{tempfile}; my $DEFAULT_SC = 'aspell -c'; my $DEFAULT_PP = 'indented'; my $DEFAULT_EXT= '.bak'; my $VERSION="0.02"; my ( $spellchecker, $ext, $attributes, $exclude_elements, $include_elements, $pretty_print, $version, $help, $man); GetOptions( 'spellchecker=s' => \$spellchecker, 'backup-extension=s' => \$ext, 'attributes' => \$attributes, 'exclude_elements=s' => \$exclude_elements, 'include_elements=s' => \$include_elements, 'pretty_print:s' => \$pretty_print, 'version' => \$version, 'help' => \$help, 'man' => \$man, ) or pod2usage(-verbose => 1, -exitval => -1); pod2usage( -verbose => 1, -exitval => 0) if $help; pod2usage( -verbose => 2, -exitval => 0) if $man; if( $version) { print "$0 version $VERSION\n"; exit;} # option processing $spellchecker ||= $DEFAULT_SC; $ext ||= $DEFAULT_EXT; if( $exclude_elements && $include_elements) { die "cannot use both --exclude-elements and --include-elements\n"; } if( defined $pretty_print and !$pretty_print) { $pretty_print= $DEFAULT_PP; } my %twig_options; my( %include_elements); if( $exclude_elements) { my @exclude_elts = split /\s+/, $exclude_elements; my %start_tag_handlers= map { $_ => \&exclude_elt } @exclude_elts; $twig_options{start_tag_handlers}= \%start_tag_handlers; } if( $include_elements) { my @include_elts = split /\s+/, $include_elements; my %start_tag_handlers= map { $_ => \&include_elt } @include_elts; $twig_options{start_tag_handlers}= \%start_tag_handlers; } $twig_options{pretty_print}= $pretty_print if( $pretty_print); foreach my $file (@ARGV) { my $id=0; my $id2elt={}; # id => element my( $tmp_fh, $tmp_file) = tempfile( "xml_spellcheck_XXXX", SUFFIX => '.txt' ); my $t= XML::Twig->new( keep_encoding =>1, %twig_options,); $t->parsefile( $file); foreach my $elt ($t->descendants( '#TEXT')) { if( (!$include_elements and !$exclude_elements) or ($include_elements and $elt->inherit_att( '#include')) or ($exclude_elements and !$elt->inherit_att( '#exclude')) ) { $id++; process_text( $t, $elt, $id, $id2elt, $tmp_fh) } } close $tmp_fh; system( "$spellchecker $tmp_file") ==0 or die "$spellchecker $tmp_file failed: $?"; open( $tmp_fh, "<$tmp_file") or die "cannot open temp file $tmp_file: $!"; while( <$tmp_fh>) { chomp; my( $id, $text)= split /:/, $_, 2; my $wrap= $id2elt->{$id}; $text=~ s{<\\n>}{\n}g; my $text_elt= $wrap->first_child or die "internal error 100\n"; if( $text_elt->gi eq '#PCDATA') { $text_elt->set_pcdata( $text); } elsif( $text_elt->gi eq '#CDATA') { $text_elt->set_cdata( $text); } else { die "internal error 101\n"; } $wrap->erase; } close $tmp_fh; rename( $file, "$file$ext") or die "cannot save backup file $file$ext: $!"; open( FILE, ">$file") or die "cannot save spell checked file $file: $!"; $t->print( \*FILE); close FILE; } sub include_elt { $_->set_att( '#include' => 1) ; } sub exclude_elt { $_->set_att( '#exclude' => 1) ; } sub process_text { my( $t, $elt, $id, $id2elt, $tmp_fh)= @_; my $wrap= $elt->wrap_in( '#SC'); #$wrap->set_att( '#ID' => $id); $id2elt->{$id}= $wrap; my $text= $elt->text; $text=~ s{\n}{<\\n>}g; print $tmp_fh "$id:$text\n"; } __END__ =head1 NAME xml_spellcheck - spellcheck XML files =head1 SYNOPSIS xml_spellcheck [options] =head1 DESCRIPTION xml_spellcheck lets you spell check the content of an XML file. It extracts the text (the content of elements and optionally of attributes), call a spell checker on it and then recreates the XML document. =head1 OPTIONS Note that all options can be abbreviated to the first letter =over 4 =item --conf Gets the options from a configuration file. NOT IMPLEMENTED YET. =item --spellchecker The command to use for spell checking, including any option By default C is used =item --backup-extension By default the original file is saved with a C<.bak> extension. This option changes the extension =item --attributes Spell check attribute content. By default attribute values are NOT spell checked. NOT YET IMPLEMENTED =item --exclude_elements A list of elements that should not be spell checked =item --include_elements A list of elements that should be spell checked (by default all elements are spell checked). C<--exclude_elements> and C<--include_elements> are mutually exclusive =item --pretty_print A pretty print style for the document, as defined in XML::Twig. If the option is provided without a value then the C style is used =item --version Dislay the tool version and exit =item --help Display help message and exit =item --man Display longer help message and exit =back =head1 EXAMPLES =head1 BUGS =head1 TODO =over 4 =item --conf option =item --attribute option =back =head1 PRE-REQUISITE XML::Twig, Getopt::Long, Pod::Usage, File::Temp XML::Twig requires XML::Parser. =head1 SEE ALSO XML::Twig =head1 COPYRIGHT AND DISCLAIMER This program is Copyright 2003 by Michel Rodriguez This program is free software; you can redistribute it and/or modify it under the terms of the Perl Artistic License or the GNU General Public License as published by the Free Software Foundation either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MER- CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. If you do not have a copy of the GNU General Public License write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. =head1 AUTHOR Michel Rodriguez xml_spellcheck is available at http://www.xmltwig.com/xmltwig/