From: <dg...@su...> - 2009-01-16 12:59:12
|
Author: bellmich
Date: Fri Jan 16 13:58:20 2009
New Revision: 157
URL: http://libwbxml.opensync.org/changeset/157

Log:
added script for XML normalization

Added:
   wbxmlTestSuite/trunk/normalize_xml.pl
Modified:
   wbxmlTestSuite/trunk/launchTests.sh

Modified: wbxmlTestSuite/trunk/launchTests.sh
==============================================================================
--- wbxmlTestSuite/trunk/launchTests.sh  Wed Jan 14 14:47:37 2009  (r156)
+++ wbxmlTestSuite/trunk/launchTests.sh  Fri Jan 16 13:58:20 2009  (r157)
@@ -10,15 +10,18 @@
     XML2WBXML=`which xml2wbxml`
 fi
 
-if [ " ${XMLLINT_PROGRAM}" != " " ]
+if [ " ${PERL_PROGRAM}" != " " ]
 then
-    XMLLINT_BIN="${XMLLINT_PROGRAM}"
+    PERL_BIN="${PERL_PROGRAM}"
 else
-    XMLLINT_BIN=`which xmllint`
+    PERL_BIN=`which perl`
 fi
-if [ " ${DIFF_BIN}" != " " ]
+
+NORMALIZE_SCRIPT="${CMAKE_CURRENT_BINARY_DIR}/normalize_xml.pl"
+
+if [ " ${DIFF_PROGRAM}" != " " ]
 then
-    DIFF_BIN="${DIFF_BIN}"
+    DIFF_BIN="${DIFF_PROGRAM}"
 else
     DIFF_BIN=`which diff`
 fi
@@ -148,16 +151,14 @@
 
         # compare original and generated XML
         echo -n "Comparing the original and the generated XML ... "
-        if [ " $XMLLINT_BIN" = " " -o " $DIFF_BIN" = " " ]
+        if [ " $PERL_BIN" = " " -o " $DIFF_BIN" = " " ]
         then
             echo UNSUPPORTED
         else
-            # (first we strip the comments from the original file)
-            awk '/<!--/,/-->/{next}1' $j > ${OUT_XML}.org
-            xmllint -format ${OUT_XML}.org > ${OUT_XML}.org0
-            xmllint -format ${OUT_XML} > ${OUT_XML}.new0
-            DIFF_RESULT=`diff ${OUT_XML}.org0 ${OUT_XML}.new0`
-            if [ " ${DIFF_RESULT}" != " " ];
+            $PERL_BIN $NORMALIZE_SCRIPT --delete-attribute xmlns $j $OUT_XML.org
+            $PERL_BIN $NORMALIZE_SCRIPT --delete-attribute xmlns $OUT_XML $OUT_XML.new
+            DIFF_RESULT=`$DIFF_BIN -b $OUT_XML.org $OUT_XML.new`
+            if [ " $DIFF_RESULT" != " " ];
             then
                 echo FAILED
                 RESULT="FAILED";

Added: wbxmlTestSuite/trunk/normalize_xml.pl
==============================================================================
--- /dev/null  00:00:00 1970  (empty, because file is newly added)
+++ wbxmlTestSuite/trunk/normalize_xml.pl  Fri Jan 16 13:58:20 2009  (r157)
@@ -0,0 +1,346 @@
+#!@PERL_PROGRAM@
+
+# Normalize an XML document so that two documents can be compared with diff.
+#
+# Usage: normalize_xml.pl [--delete-attribute NAME] ORIGINAL NEW
+#
+# - the XML declaration and the DOCTYPE are copied (joined on one line)
+# - comments are removed
+# - every tag and every line of text is written on its own line and
+#   indented according to the nesting depth
+# - attributes whose name starts with NAME are removed
+# - NEW must not exist before the script is started
+
+use strict;
+use warnings FATAL => qw( all );
+use English;
+
+# check params
+# 1. original file
+# 2. new file
+# 3. no more params
+
+my $ignore_attribute = "";
+# @ARGV is tested first: warnings are fatal and $ARGV[0] is undef without params
+if (@ARGV and $ARGV[0] eq "--delete-attribute") {
+    shift;
+    die "The attribute which must be deleted is missing."
+        if (not $ARGV[0]);
+    $ignore_attribute = shift;
+}
+
+die "There must be two arguments (old and new file)."
+    if (scalar(@ARGV) != 2);
+
+my $org_filename = $ARGV[0];
+my $new_filename = $ARGV[1];
+
+die "The original file does not exist."
+    if (not $org_filename or not -e $org_filename);
+
+die "The new file is not valid filename."
+    if (not $new_filename);
+
+die "The new file exists already."
+    if (-e $new_filename);
+
+# open files
+
+die "The original file is not readable."
+    if (not open(my $ORG_FD, "<", $org_filename));
+
+die "The new file is not writeable."
+    if (not open(my $NEW_FD, ">", $new_filename));
+
+# look for the XML tree
+# - version and encoding
+# - DTD
+# - XML tree
+
+my $line = <$ORG_FD>;
+
+my $state = "IGNORE";
+do {
+    ## the prolog must be followed by an XML element
+    die "The original file does not contain an XML element."
+        if (not defined $line);
+
+    ## determine state
+
+    ## XML detection
+    if ($state eq "IGNORE" and $line =~ q{^\s*<[a-zA-Z]}) {
+        $state = "XML_TREE";
+    }
+
+    ## version and encoding detection
+    if ($state eq "IGNORE" and $line =~ q{^\s*<\?}) {
+        $state = "ENC_OPEN";
+    }
+    if ($state eq "ENC_OPEN" and $line =~ q{\?>\s*$}) {
+        $state = "WRITE";
+    }
+
+    ## DTD detection
+    if ($state eq "IGNORE" and $line =~ q{^\s*<!DOCTYPE}) {
+        $state = "DTD_OPEN";
+    }
+    if ($state eq "DTD_OPEN" and $line =~ q{>\s*$}) {
+        $state = "WRITE";
+    }
+
+    ## comment detection
+    if ($state eq "IGNORE" and $line =~ q{^\s*<!--}) {
+        $state = "COMMENT_OPEN";
+    }
+    if ($state eq "COMMENT_OPEN" and $line =~ q{-->\s*$}) {
+        $state = "IGNORE";
+    }
+
+    ## handle data
+    if ($state eq "IGNORE") {
+        $line = <$ORG_FD>;
+    }
+    if ($state eq "WRITE") {
+        print $NEW_FD $line;
+        $line = <$ORG_FD>;
+        $state = "IGNORE";
+    }
+    if ($state =~ q{_OPEN$}) {
+        my $next_line = <$ORG_FD>;
+        die "The original file ends within a declaration or comment."
+            if (not defined $next_line);
+        $line .= $next_line;
+        $line =~ s{\s*[\n\r]+\s*}{ }sg;
+        $line .= "\n";
+    }
+} while ($state ne "XML_TREE");
+
+# XML tree state
+# - element
+#   <
+#   element name
+#   blank
+#   attribute name
+#   = attribute assignment
+#   attribute value
+#   >
+# - data
+# - comment
+
+my $indent = 0;
+my $char = "";
+$state = "NEUTRAL";
+while (1) {
+    ## $line works line oriented
+    ## but the parser works character oriented
+    last if (not defined $line);
+    if (length ($line) == 0) {
+        $line = <$ORG_FD>;
+        last if (not defined $line or length($line) < 1);
+    }
+    $char = substr($line, 0, 1);
+    $line = substr($line, 1);
+
+    # check state
+
+    # reset text state
+    if ($state eq "TEXT_NEWLINE" and $char !~ q{[\s\n\r]}) {
+        $state = "NEUTRAL";
+    }
+
+    # ignore leading blanks (normalization)
+    if ($state eq "NEUTRAL" and $char =~ q{\s}) {
+        next;
+    }
+
+    # handle comment or element which starts in a text line
+    if ($state eq "TEXT" and $char eq "<") {
+        print $NEW_FD "\n";
+        $state = "NEUTRAL";
+    }
+
+    # try to handle a new comment or element
+    if ($state eq "NEUTRAL" and $char eq "<") {
+        # let's look forward (element or comment)
+        die "A standalone smaller than \"<\" sign is not allowed in XML."
+            if (length($line) < 1);
+        $char = substr($line, 0, 1);
+        $line = substr($line, 1);
+        if ($char eq "!") {
+            ## this should be a comment
+            $char = substr($line, 0, 2);
+            die "A comment must always start with '<!--' and not with '<!${char}'."
+                if ($char ne "--");
+            $line = substr($line, 2);
+            $state = "COMMENT";
+        } elsif ($char =~ q{[a-zA-Z]}) {
+            ## this is an element
+            for (my $i = 0; $i < $indent; $i++) {
+                print $NEW_FD " ";
+            }
+            $indent++;
+            print $NEW_FD "<".$char;
+            $state = "ELEMENT";
+        } elsif ($char eq "/") {
+            ## this is a closing element
+            $indent--;
+            for (my $i = 0; $i < $indent; $i++) {
+                print $NEW_FD " ";
+            }
+            print $NEW_FD "</";
+            $state = "ELEMENT";
+        } else {
+            ## this is illegal
+            die "A smaller than \"<\" sign must be a tag or a comment.";
+        }
+        next;
+    }
+
+    # ignore comments
+    if ($state eq "COMMENT") {
+        if ($char eq "-" and substr($line, 0, 2) eq "->") {
+            # end of comment
+            $line = substr($line, 2);
+            $state = "NEUTRAL";
+        }
+        next;
+    }
+
+    # read and write text data
+    if ($state eq "NEUTRAL" and $char ne "<") {
+        # new text data
+        for (my $i = 0; $i < $indent; $i++) {
+            print $NEW_FD " ";
+        }
+        print $NEW_FD $char;
+        $state = "TEXT";
+        next;
+    }
+    if ($state eq "TEXT" and $char eq "\r") {
+        next;
+    }
+    if ($state eq "TEXT" and $char eq "\n") {
+        print $NEW_FD "\n";
+        $state = "TEXT_NEWLINE";
+        next;
+    }
+    # if & is not handled as &amp; then this error cannot be recovered
+    if ($state eq "TEXT" and $char eq '"') {
+        print $NEW_FD "&quot;";
+        next;
+    }
+    if ($state eq "TEXT" and $char eq "'") {
+        print $NEW_FD "&apos;";
+        next;
+    }
+    if ($state eq "TEXT" and $char eq '<') {
+        print $NEW_FD "&lt;";
+        next;
+    }
+    if ($state eq "TEXT" and $char eq '>') {
+        print $NEW_FD "&gt;";
+        next;
+    }
+    if ($state eq "TEXT") {
+        # every other character is copied unchanged
+        print $NEW_FD $char;
+        next;
+    }
+    if ($state eq "TEXT_NEWLINE" and $char =~ q{[\s\r\n]}) {
+        next;
+    }
+
+    # read element name
+    if ($state eq "ELEMENT" and $char =~ q{[a-zA-Z0-9_]}) {
+        print $NEW_FD $char;
+        next;
+    }
+
+    # detect space for potential attribute
+    if ($state eq "ELEMENT" and $char =~ q{[\s\r\n]}) {
+        $state = "POTENTIAL_ATTRIBUTE";
+        next;
+    }
+
+    # detect and read attribute
+    if ($state eq "POTENTIAL_ATTRIBUTE" and $char =~ q{[a-zA-Z0-9_]}) {
+        if ($ignore_attribute and
+            substr($ignore_attribute, 0, 1) eq $char and
+            substr($ignore_attribute, 1) eq substr($line, 0, length(substr($ignore_attribute, 1))))
+        {
+            # let's ignore the attribute (the rest of its name, "=" and the value)
+            $line =~ s{^[a-zA-Z_0-9]*=}{};
+            if (substr($line, 0, 1) eq "'") {
+                $line =~ s{^'[^']*'}{};
+            } else {
+                $line =~ s{^"[^"]*"}{};
+            }
+            next;
+        }
+        $state = "ATTRIBUTE_NAME";
+        print $NEW_FD " ";
+    }
+    if ($state eq "ATTRIBUTE_NAME" and $char =~ q{[a-zA-Z0-9_]}) {
+        print $NEW_FD $char;
+        next;
+    }
+    if ($state eq "ATTRIBUTE_NAME" and $char eq "=") {
+        print $NEW_FD $char;
+        $state = "ATTRIBUTE_ASSIGN";
+        next;
+    }
+    if ($state eq "ATTRIBUTE_ASSIGN" and $char eq '"') {
+        print $NEW_FD $char;
+        $state = "ATTRIBUTE_VALUE_QUOT";
+        next;
+    }
+    if ($state eq "ATTRIBUTE_VALUE_QUOT" and $char ne '"') {
+        print $NEW_FD $char;
+        next;
+    }
+    if ($state eq "ATTRIBUTE_VALUE_QUOT" and $char eq '"') {
+        print $NEW_FD $char;
+        $state = "POTENTIAL_ATTRIBUTE";
+        next;
+    }
+    if ($state eq "ATTRIBUTE_ASSIGN" and $char eq "'") {
+        print $NEW_FD $char;
+        $state = "ATTRIBUTE_VALUE_APOS";
+        next;
+    }
+    if ($state eq "ATTRIBUTE_VALUE_APOS" and $char ne "'") {
+        print $NEW_FD $char;
+        next;
+    }
+    if ($state eq "ATTRIBUTE_VALUE_APOS" and $char eq "'") {
+        print $NEW_FD $char;
+        $state = "POTENTIAL_ATTRIBUTE";
+        next;
+    }
+
+    # handle standalone element
+    if (($state eq "ELEMENT" or $state eq "POTENTIAL_ATTRIBUTE")
+        and $char eq "/")
+    {
+        print $NEW_FD "/";
+        $indent--;
+        next;
+    }
+
+    # read element end
+    if (($state eq "ELEMENT" or $state eq "POTENTIAL_ATTRIBUTE")
+        and $char eq ">")
+    {
+        print $NEW_FD $char."\n";
+        $state = "NEUTRAL";
+        next;
+    }
+
+};
+
+# close files
+
+close $ORG_FD;
+# buffered write errors are only reported when the file is closed
+die "The new file could not be written completely."
+    if (not close $NEW_FD);
|