[Lxr-commits] CVS: lxr/lib/LXR SimpleParse.pm,1.22,1.23

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/lxr/lxr/lib/LXR
In directory sfp-cvs-1.v30.ch3.sourceforge.com:/tmp/cvs-serv9031/lib/LXR

Modified Files:
	SimpleParse.pm 
Log Message:
SimpleParse.pm: fix parsing inaccuracies and try to optimise for speed (though not perfect to keep parsing correctness)

Index: SimpleParse.pm
===================================================================
RCS file: /cvsroot/lxr/lxr/lib/LXR/SimpleParse.pm,v
retrieving revision 1.22
retrieving revision 1.23
diff -u -d -r1.22 -r1.23

--- SimpleParse.pm	21 Sep 2013 12:54:52 -0000	1.22
+++ SimpleParse.pm	8 Nov 2013 08:27:24 -0000	1.23
@@ -42,20 +42,23 @@
 	&untabify
 	&init
 	&nextfrag
+	$dountab
 );
 
 # Global module variables
 
-my $fileh;       # File handle
-my @frags;       # Fragments in queue
-my @bodyid;      # Array of body type ids
-my @open;        # Fragment opening delimiters
-my @term;        # Fragment closing delimiters
-my @stay;        # Fragment maintaining current context
-my $split;       # Fragmentation regexp
-my $open;        # Fragment opening regexp
-my $continue;	 # Fragment maintaining current context for "no category"
-my $tabwidth;    # Tab width
+my $fileh;		# File handle
+my @frags;		# Fragments in queue
+my $next;		# Current fragment
+my @bodyid;		# Array of body type ids
+my @open;		# Fragment opening delimiters
+my @term;		# Fragment closing delimiters
+my @stay;		# Fragment maintaining current context
+my $split;		# Fragmentation regexp
+my $open;		# Fragment opening regexp
+my $continue;	# Fragment maintaining current context for "no category"
+my $tabwidth;   # Tab width
+our $dountab;	# Untabify flag (in nextfrag)
 
 
 =head2 C<init ($fileh, $tabhint, @blksep)>
@@ -89,18 +92,19 @@
 	my @blksep;
 
 	@frags    = ();
+	$next     = undef;
 	@bodyid   = ();
 	@open     = ();
 	@term     = ();
 	@stay     = ();
 	$split    = '';
 	$open     = '';
-	$continue = '';
-	$tabwidth = 8;
+	$continue = undef;
+	$dountab  = 1;
 	my $tabhint;
 
 	($fileh, $tabhint, @blksep) = @_;
-	$tabwidth = $tabhint // $tabwidth;
+	$tabwidth = $tabhint // 8;
 
 # Consider every specification in the order given
 	foreach my $s (@blksep) {
@@ -108,45 +112,23 @@
 		my $k = (keys(%$s))[0];
 		if ($k eq 'atom') {		# special case for uncategorised fragments
 			$continue = $$s{$k};
-		}
-		else {
+		} else {
 			#	Value is itself a reference to an array
-			my $v = @$s{$k};
+			my $v = $$s{$k};
 			push (@bodyid, $k);			# Category name
 			push (@open, $$v[0]);		# Open delimiter
-			if (defined($$v[1])) {
-				push (@term, $$v[1]);	# Closing delimiter
-			} else {
-				push (@term, undef);
-			}
-			if (defined($$v[2])) {
-				push (@stay, $$v[2]);	# Locking pattern
-			} else {
-				push (@stay, '');
-			}
+			push (@term, $$v[1]);	# Closing delimiter
+			push (@stay, $$v[2]);	# Locking pattern
 		}
 	}
 
-	# Replace the anchors with a Start_of_Line marker
-	# The markers are removed by sub markupfile before
-	# emiting HTML code
-	foreach (@open) {
-		$_ =~ s/^\^/\xFF/;
-	}
-	foreach (@term) {
-		$_ =~ s/^\^/\xFF/;
-	}
-	foreach (@stay) {
-		$_ =~ s/^\^/\xFF/;
-	}
-	$continue =~ s/^\^/\xFF/;
-
 	# Create the regexps to find any opening delimiter
 	foreach (@open) {
-		$open  .= "^($_)\$|";
+		$open  .= "($_)|";
 		$split .= "$_|";
 	}
 	chop($open);	# Remove the last (extraneous) bar
+	$open = '^[\xFF\n]*(?:'.$open.')$';	# Set the anchors
 	chop($split);	# Remove the last (extraneous) bar
 }
 
@@ -179,8 +161,8 @@
 sub untabify {
 	my $t = $_[1] || 8;
 
-	$_[0] =~ s/^(\t+)/(' ' x ($t * length($1)))/ge;                 # Optimize for common case.
-	$_[0] =~ s/([^\t]*)\t/$1.(' ' x ($t - (length($1) % $t)))/ge;
+	$_[0] =~ s/^(\t+)/(' ' x ($t * length($1)))/geo;	# Optimize for common case.
+	$_[0] =~ s/([^\t]*)\t/$1.(' ' x ($t - (length($1) % $t)))/geo;
 	return ($_[0]);
 }
 
@@ -235,46 +217,42 @@
 	my $frag  = undef;	# output buffer
 	my $term  = undef;	# closing delim pattern
 	my $stay  = $continue;	# lock pattern
-	my $line  = '';		# line buffer
+	my $change = $split;	# delimiter introducing a category change
 	# These initial values set the state for the "anonymous"
 	# default category (i.e. code). It is switched to another
 	# state if $next (the following characters to process)
 	# begins with a starting delimiter.
+	my $line;			# line buffer
+	my $opos;			# position of this delimiter
+	my $spos;			# position of a (conflicting?) "stay" delimimter
 
 # 		print "nextfrag called\n";
 
 	while (1) {
+		$next = shift(@frags) if !defined($next);
 
 		# read one more line if we have processed
 		# all of the previously read line
-		if ($#frags < 0) {
+		if (!$next) {
 			$line = $fileh->getline;
+		#	Exit loop on EOF returning the currently assembled region
+		#	or an undefined pair
+			last if !defined($line);
 			# Interpret an Emacs-style tab specification
-			if	(  $. <= 2		# Line # 1?
-				&& $line =~ m/^.*-[*]-.*?[ \t;]tab-width:[ \t]*([0-9]+).*-[*]-/
+			if	(  $. <= 2		# Line # 1 or 2?
+				&& $line =~ m/^.*-\*-.*?[ \t;]tab-width:[ \t]*([0-9]+).*-\*-/o
 				) {
 				if ($1) {	# make sure there really is a non-zero tabwidth
 					$tabwidth = $1;
 				}
 			}
-
 #			&untabify($line, $tabwidth); # We inline this for performance.
-				# Optimize for common case.
-			if (defined($line)) {
-				$line =~ s/^(\t+)/' ' x ($tabwidth * length($1))/ge;
-				$line =~ s/([^\t]*)\t/$1.(' ' x ($tabwidth - (length($1) % $tabwidth)))/ge;
-
-				$frags[0] = "\xFF" . $line;	# Add SOL marker
+			# Optimize for common case.
+			if ($dountab) {
+				$line =~ s/^(\t+)/' ' x ($tabwidth * length($1))/geo;
+				$line =~ s/([^\t]*)\t/$1.(' ' x ($tabwidth - (length($1) % $tabwidth)))/geo;
 			}
-		}
-
-#	Exit loop on EOF returning the currently assembled region
-#	or an undefined pair
-		last if $#frags < 0;
-
-		# skip empty fragments
-		if ($frags[0] eq '') {
-			shift(@frags);
+			$next = "\xFF" . $line;	# Add SOL marker
 		}
 
 #	If the specification defines a locking pattern (in $stay),
@@ -282,135 +260,99 @@
 #	only if the "stay" atom is located inside the present category.
 #	The test below is rather complicated because we rely on
 #	pattern matching, not LR parsing.
-#	1-	See if there is a "stay" atom in the line.
+#	1-	See if there is a terminator (either the closing delimiter
+#		if defined or any opening delimiter) in the line.
+#		If none, the whole line is made of a single category.
+#		Otherwise, note its position.
+#	2-	Loop on the presence of a "stay" atom in the line.
 #		If none, leave the loop.
-#	2-	Check the terminator (either the closing delimiter if
-#		defined or any opening delimiter).
 #	3-	If the "stay" atom is located after (i.e. at the right of)
 #		the closing delimiter, leave the loop.
 #	4-	The part up to and including the "stay" atom is shifted
-#		into the candidate fragment.
+#		into the candidate fragment and the position of the
+#		terminator is updated for the next iteration of the
+#		inner loop.
 #	The process is repeated until there is no more "stay" atoms
 #	in the correct range.
 
 		# check for "stay" atoms
-		my $next = shift(@frags);
-		if ($stay ne '') {
-			while ($next =~ m/$stay/) {
-			# Compute the position of the "stay" atom
-				$next =~ m/^(.*?)($stay)/s;
-				my $spos = undef;
-				if (defined($2)) {
-					$spos = length($1) || 0;
-				}
-				my $opos = undef;
+		if (defined($stay)) {
 			# Look for "term" or any "open delim" if not defined
-				my $change = $term // $split;
-				if ($next =~ m/$change/) {
+			$opos = undef;
+			while	(	!defined($opos)
+					&&	$next =~ m/$change/
+					) {
 			# Compute the position of the "end" delimiter
-					$next =~ m/^(.*?)($change)/s;
-					if (defined($2)) {
-						$opos = length($1) || 0 ;
+				$opos = $-[0];
+				while ($next =~ m/$stay/) {
+				# Compute the end position of the "stay" atom
+					$spos = $+[0];
+				# Compare positions and make decision
+					last if $-[0] > $opos;
+				# There is a "stay" atom, shift it into fragment
+					$frag .= substr($next, 0, $spos);
+					$next = substr($next, $spos);
+					$opos -= $spos;
+					if ($opos <= 0) {
+						$opos = undef;
+						last;
 					}
 				}
-			# Compare positions and make decision
-				last if (defined($opos) && ($spos > $opos));
-			# There is a "stay" atom, shift it into fragment
-				$next =~ s/^(.*?)($stay)//s;
-				$frag .= $1 . $2;
 			}
 		}
 
 #	Have we already started a region?
-		if (defined($frag)) {
+		if	(	defined($frag)				# something in output buffer?
+			&&	$frag !~ m/^[\xFF\n]*$/o	# not just newlines?
+			) {
 #	We already have something in the buffer.
 #	Is it a named category?
 #	Add to output buffer till we find a closing delimiter.
 #	Remember that "stay" constructs have been processed above.
 			if (defined($btype) && defined($term)) {
-				if ($next =~ m/$term/) {		# A close delim in this fragment?
-			# Next instruction group is 5.8 compatible but does
-			# not allow capture parenthesis in regexps
-			#		$next =~ m/^(.*?)($term)(.*)/s;
-			#		if ($3 ne '') {
-			#			unshift(@frags, $3);	# Requeue last part
-			#		}
-			#		$frag .= $1 . $2;
-			# This group contains 5.10 features and removes the
-			# above mentioned limitation
-					$next =~ m/^(?'HEAD'.*?$term)(?'TAIL'.*)/s;
-					if ($+{'TAIL'} ne '') {
-						unshift(@frags, $+{'TAIL'});	# Requeue last part
-					}
-					$frag .= $+{'HEAD'};
-			# End of group
-					last;						# We are done, terminator met
+				if ($next =~ m/$term/) {	# A close delim in this fragment?
+					$frag .= substr($next, 0, $+[0]);
+					$next = substr($next, $+[0]);
+					last;					# We are done, terminator met
 				}
 #	An anonymous region is in the buffer (it defaults to "code").
 #	This default region is left on any opening delimiter.
 			} else {
-				if ($next =~ m/^($split)/) {
-					unshift(@frags, $next);		# requeue block
-#					print "encountered open token while btype was $btype\n";
+					# Split at delimiter
+				if ($next =~ s/^(.*?)($split)//) {	# An open delim in this fragment?
+					unshift(@frags, $next) if $next ne '';	# Requeue last part
+					$frag .= $1;		# Stuff part before delim
+					$next = $2;			# Delimiter
 					last;
 				}
-				if ($next =~ m/$split/) {		# An open delim in this fragment?
-			# Next instruction group is 5.8 compatible but does
-			# not allow capture parenthesis in regexps
-			#		$next =~ m/^(.*?)($split)(.*)/s;
-			#		if ($3 ne '') {
-			#			unshift(@frags, $3);	# Requeue last part
-			#		}
-			#		unshift(@frags, $2);		# Requeue open delimiter
-			#		$next = $1
-			# This group contains 5.10 features and removes the
-			# above mentioned limitation
-					$next =~ m/^(?'HEAD'.*?)(?'OPEN'$split)(?'TAIL'.*)/s;
-					if ($+{'TAIL'} ne '') {
-						unshift(@frags, $+{'TAIL'});	# Requeue last part
-					}
-					unshift(@frags, $+{'OPEN'});		# Requeue open delimiter
-					$next = $+{'HEAD'}
-			# End of group
-				}
 			}
-			$frag .= $next;
+			$frag .= $next;				# Full fragment (no delim)
+			$next = undef;
 		} else {
 #	This begins a new region (output buffer empty).
 #	Stuff the sequence up to any opening delimiter or the complete
 #	input line if there is no delimiter in range.
-#			print "start of new fragment\n";
-			if ($next =~ m/$split/) {			# An open delim in this fragment?
-			# Next instruction group is 5.8 compatible but does
-			# not allow capture parenthesis in regexps
-			#	$next =~ m/^(.*?)($split)(.*)/s;	# Split fragment at first
-			#	if ($3 ne '') {
-			#		unshift(@frags, $3);		# Requeue last part
-			#	}
-			#	if ($1 ne '') {					# Choose which frag to process
-			#		unshift(@frags, $2);		# Queue delimiter
-			#		$frag = $1;
-			#	} else {
-			#		$frag = $2;
-			# This group contains 5.10 features and removes the
-			# above mentioned limitation
-				$next =~ m/^(?'HEAD'.*?)(?'OPEN'$split)(?'TAIL'.*)/s;	# Split fragment at first
-				if ($+{'TAIL'} ne '') {
-					unshift(@frags, $+{'TAIL'});	# Requeue last part
-				}
-				if ($+{'HEAD'} ne '') {			# Choose which frag to process
-					unshift(@frags, $+{'OPEN'});	# Queue delimiter
-					$frag = $+{'HEAD'};
+# 			print "start of new fragment\n";
+			if ($next =~ s/^(.*?)($split)//) {	# An open delim in this fragment?
+				if ($1 ne '') {			# Anything before the delim?
+					unshift(@frags, $next) if $next ne '';	# Requeue last part
+					$next = $2;			# Delimiter
+					$frag .= $1;		# Stuff part before delim
+					last if $frag !~ m/^[\xFF\n]*$/o;
+					$frag .= $next;		# Fragment was "empty"
+					$next = undef;
 				} else {
-					$frag = $+{'OPEN'};
+					$frag .= $2;
+					$next = undef if $next eq '';
 				}
-			# End of group
-			} else {								# Full fragment (no delim)
-				$frag = $next;
+			} else {					# Full fragment (no delim)
+				$frag .= $next;
+				$next = undef;
 			}
 #	Find the blocktype of the current block
-			if (defined($frag) && (@_ = $frag =~ m/$open/)) {
-#				print "hit:$frag\n";
+# 			if (defined($frag) && (@_ = $frag =~ m/$open/)) {
+			if (@_ = $frag =~ m/$open/) {
+# 				print "hit:$frag\n";
 				# grep in a scalar context returns the number of times
 				# EXPR evaluates to true, which is this case will be
 				# the index of the first defined element in @_.
@@ -422,14 +364,18 @@
 					last;
 				} else {
 #	Set the category characteristics for further parsing
-					$term = $term[$btype];
-					$stay = $stay[$btype];
-				}
+					$term	= $term[$btype];
+					if ('CODE' eq ref($term)) {
+						$term = eval(&$term());
+					}
+					$stay	= $stay[$btype];
+					$change	= $term // $split;
+ 				}
 			}
 		}
 	}
 	$btype = $bodyid[$btype] if defined($btype);
-
+	$frag =~ s/\xFF//go;	# Remove start of line markers
 	return ($btype, $frag);
 }
 
@@ -466,9 +412,8 @@
 =cut
 
 sub requeuefrag {
-	my $frag = shift;
-
-	unshift(@frags, $frag);	# Requeue fragment
+	unshift(@frags, $next) if defined($next);	# Requeue fragment
+	$next = $_[0];
 }
 
 1;