From: Neil B. <ne...@cs...> - 2000-10-25 02:05:17
On Tuesday October 24, aho...@in... wrote:
> >
> > I'll maybe have a quick look at these too...
> > 7. And what about a WikiLink? and another WikiLink?WithSameStem on one line?
> > 8. Other way around: WikiLink?WithSameStem plus WikiLink?
>

I would like to suggest the following patch (against current CVS), which takes
a different approach, one that makes this sort of bug much less likely.  To me
it makes the code a whole lot cleaner, but maybe that's just me.

The basic approach is to chop the line up while matching, rather than
extracting strings and then searching for those strings.

It also fixes a bug whereby

  ;WikiWord: definition there-of
  ;OtherWikiWord: other definition

gets converted into bad html, because the WikiWord gets converted into a full
reference (containing a ':') before the search for /^;+.*?:/ is done.  I
simply delay replacing the tokens until (nearly) the last moment.

Let me know what you think.

NeilBrown

Index: lib/transform.php
===================================================================
RCS file: /cvsroot/phpwiki/phpwiki/lib/transform.php,v
retrieving revision 1.4
diff -u -r1.4 transform.php
--- lib/transform.php	2000/10/24 10:32:37	1.4
+++ lib/transform.php	2000/10/25 01:58:48
@@ -1,5 +1,23 @@
 <!-- $Id: transform.php,v 1.4 2000/10/24 10:32:37 ahollosi Exp $ -->
 <?php
+function tokenize($str, $pattern, &$orig, &$ntokens) {
+   global $FieldSeparator;
+   // Find any strings in $str that match $pattern and
+   // store them in $orig[], replacing them with a token
+   $new = "";
+   while (preg_match("/^(.*?)($pattern)/", $str, $matches)) {
+      $linktoken = $FieldSeparator . $FieldSeparator . ($ntokens++) . $FieldSeparator;
+      $new .= $matches[1].$linktoken;
+      $orig[] = $matches[2];
+      $str = substr($str, strlen($matches[0]));
+   }
+   $new .= $str;
+   return $new;
+}
+
+
+
 // expects $pagehash and $html to be set

 // Set up inline links and images
@@ -29,6 +47,7 @@
       unset($tokens);
       unset($replacements);
       $ntokens = 0;
+      $replacements = array();

       $tmpline = $pagehash["content"][$index];

@@ -54,70 +73,40 @@
       //////////////////////////////////////////////////////////
       // New linking scheme: links are in brackets. This will
       // emulate typical HTML linking as well as Wiki linking.
+
+      // First need to protect [[.
+      $tmpline = tokenize($tmpline, "\[\[", $replacements, $ntokens);
+
-      // match anything between brackets except only numbers
-      // trying:
-      $numBracketLinks = preg_match_all("/\[.+?\]/", $tmpline, $brktlinks);
-      /* On 12 Jul,2000 Jeff <da...@da...> adds:
-       *
-       * Simple sorting doesnt work, since (in ASCII) '[' comes between
-       * the upper- and lower-case characters.
-       *
-       * Using sort "[[Link] [Link]" will come out wrong, using
-       * rsort "[[link] [link]" will come out wrong.
-       * (An appropriate usort would work.)
-       *
-       * I've added a look-behind assertion to the preg_replace which,
-       * I think, fixes the problem.  I only hope that all PHP versions
-       * support look-behind assertions....
-      // sort instead of rsort or "[[link] [link]" will be rendered wrong.
-      sort($brktlinks[0]);
-      reset($brktlinks[0]);
-      */
-
-      for ($i = 0; $i < $numBracketLinks; $i++) {
-         $brktlink = preg_quote($brktlinks[0][$i]);
-         $linktoken = $FieldSeparator . $FieldSeparator . ++$ntokens . $FieldSeparator;
-         /* PS:
-          * If you're wondering about the double $FieldSeparator,
-          * consider what happens to (the admittedly sick):
-          *       "[Link1] [Link2]1[Link3]"
-          *
-          * Answer: without the double field separator, it gets
-          * tokenized to "%1% %2%1%3%" (using % to represent $FieldSeparator),
-          * which will get munged as soon as '%1%' is substituted with it's
-          * final value.
-          */
-         $tmpline = preg_replace("|(?<!\[)$brktlink|",
-                                 $linktoken,
-                                 $tmpline);
-
-         $tokens[] = $linktoken;
-         $link = ParseAndLink($brktlinks[0][$i]);
-         $replacements[] = $link['link'];
+      // Now process the [\d+] links which are numeric references
+      $oldn = $ntokens;
+      $tmpline = tokenize($tmpline, "\[\d+\]", $replacements, $ntokens);
+      while ($oldn < $ntokens) {
+         preg_match("/\[(\d+)\]/", $replacements[$oldn], $m);
+         $num = $m[1];
+         if (! empty($embedded[$num])) {
+            $replacements[$oldn] = $embedded[$num];
+         }
+         $oldn++;
+      }
+      // match anything else between brackets
+
+      $oldn = $ntokens;
+      $tmpline = tokenize($tmpline, "\[.+?\]", $replacements, $ntokens);
+      while ($oldn < $ntokens) {
+         $link = ParseAndLink($replacements[$oldn]);
+         $replacements[$oldn] = $link['link'];
+         $oldn++;
       }

       //////////////////////////////////////////////////////////
       // replace all URL's with tokens, so we don't confuse them
       // with Wiki words later. Wiki words in URL's break things.
-
-      $hasURLs = preg_match_all("/\b($AllowedProtocols):[^\s\<\>\[\]\"'\(\)]*[^\s\<\>\[\]\"'\(\)\,\.\?]/", $tmpline, $urls);
-      // have to sort, otherwise errors creep in when the domain appears
-      // in two consecutive URL's on the same line, but the second is
-      // longer e.g. http://c2.com followed by http://c2.com/wiki
-      rsort($urls[0]);
-      reset($urls[0]);
-
-      for ($i = 0; $i < $hasURLs; $i++) {
-         $inplaceURL = preg_quote($urls[0][$i]);
-         $URLtoken = $FieldSeparator . $FieldSeparator . ++$ntokens . $FieldSeparator;
-         $tmpline = preg_replace("|$inplaceURL|",
-                                 $URLtoken,
-                                 $tmpline);
-
-         $tokens[] = $URLtoken;
-         $replacements[] = LinkURL($urls[0][$i]);
+      $tmpline = tokenize($tmpline, "\b($AllowedProtocols):[^\s\<\>\[\]\"'\(\)]*[^\s\<\>\[\]\"'\(\)\,\.\?]", $replacements, $ntokens);
+      while ($oldn < $ntokens) {
+         $replacements[$oldn] = LinkURL($replacements[$oldn]);
+         $oldn++;
       }

       // escape HTML metachars
@@ -154,56 +143,20 @@
       // Link Wiki words
       // Wikiwords preceeded by a '!' are not linked
-      if (preg_match_all("#!?\b(([A-Z][a-z]+){2,})\b#",
-                         $tmpline, $link)) {
-         // uniq the list of matches
-         unset($hash);
-         for ($i = 0; $link[0][$i]; $i++) {
-            if(strstr($link[0][$i], '!'))      // hashval sports a value
-               $hashval = "0000:".$link[0][$i]; // in front that guarantees
-            else                               // correct sorting
-               $hashval = sprintf("%04d:%s", 9876-strlen($link[0][$i]) , $link[0][$i]);
-            $hash[$hashval] = 1;
-         }
-
-         // all '!WikiName' entries are sorted first
-         ksort($hash);
-         while (list($realfile, $val) = each($hash)) {
-            $realfile = substr($realfile, 5); // get rid of sort value
-            $token = $FieldSeparator . $FieldSeparator . ++$ntokens . $FieldSeparator;
-            $tmpline = str_replace($realfile, $token, $tmpline);
-
-            $tokens[] = $token;
-            if (strstr($realfile, '!')) {
-               $replacements[] = substr($realfile, 1);
-            }
-            elseif (IsWikiPage($dbi, $realfile)) {
-               $replacements[] = LinkExistingWikiWord($realfile);
-            } else {
-               $replacements[] = LinkUnknownWikiWord($realfile);
-            }
-         }
+      $oldn = $ntokens;
+      $tmpline = tokenize($tmpline, "!?\b(([A-Z][a-z]+){2,})\b", $replacements, $ntokens);
+      while ($oldn < $ntokens) {
+         $old = $replacements[$oldn];
+         if (substr($old,0,1)=='!') {
+            $replacements[$oldn] = substr($old,1);
+         } elseif (IsWikiPage($dbi, $old)) {
+            $replacements[$oldn] = LinkExistingWikiWord($old);
+         } else {
+            $replacements[$oldn] = LinkUnknownWikiWord($old);
+         }
+         $oldn++;
       }

-      ///////////////////////////////////////////////////////
-      // Replace tokens
-      for ($i = 0; $i < $ntokens; $i++)
-         $tmpline = str_replace($tokens[$i], $replacements[$i], $tmpline);
-
-
-      // match and replace all user-defined links ([1], [2], [3]...)
-      preg_match_all("|\[(\d+)\]|", $tmpline, $match);
-      if (count($match[0])) {
-         for ($k = 0; $k < count($match[0]); $k++) {
-            if (! empty($embedded[$match[1][$k]])) {
-               $linkpattern = preg_quote($match[0][$k]);
-               $tmpline = preg_replace("|$linkpattern|",
-                                       $embedded[$match[1][$k]],
-                                       $tmpline);
-            }
-         }
-      }

       // HTML modes: pre, unordered/ordered lists, term/def (using TAB)
       if (preg_match("/(^\t+)(.*?)(:\t)(.*$)/", $tmpline, $matches)) {
@@ -279,6 +232,11 @@
          // it's ordinary output if nothing else
          $html .= SetHTMLOutputMode("", ZERO_DEPTH, 0);
       }
+
+      ///////////////////////////////////////////////////////
+      // Replace tokens
+      for ($i = 0; $i < $ntokens; $i++)
+         $tmpline = str_replace($FieldSeparator.$FieldSeparator.$i.$FieldSeparator, $replacements[$i], $tmpline);

       $tmpline = str_replace("%%Search%%", $quick_search_box, $tmpline);
       $tmpline = str_replace("%%Fullsearch%%", $full_search_box, $tmpline);
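
For illustration, here is a minimal standalone sketch of the tokenize-and-replace-later
idea the patch uses.  It is not the patch itself: tokenize_demo, the '~' stand-in for
$FieldSeparator, and the trivial <a href> replacement are made up for the example; the
real code builds replacements with ParseAndLink(), LinkURL(), IsWikiPage() and friends.

<?php
// Sketch of "chop the line up while matching, substitute at the very end".

function tokenize_demo($str, $pattern, &$orig, &$ntokens) {
   $sep = "~";          // stand-in for $FieldSeparator
   $new = "";
   // consume $str left to right; each match becomes a token and is
   // remembered in $orig[] for substitution at the very end
   while (preg_match("/^(.*?)($pattern)/", $str, $m)) {
      $new .= $m[1] . $sep . $sep . ($ntokens++) . $sep;
      $orig[] = $m[2];
      $str = substr($str, strlen($m[0]));
   }
   return $new . $str;
}

$line    = "See WikiLink and WikiLinkWithSameStem here";
$orig    = array();
$ntokens = 0;

// pass 1: wiki words become tokens in left-to-right order, so the shorter
// name can never clobber part of the longer one
$line = tokenize_demo($line, "([A-Z][a-z]+){2,}", $orig, $ntokens);
// $line is now "See ~~0~ and ~~1~ here"

// ... the other markup passes (lists, ;term: definitions, ...) run here and
// never see a ':' or other markup coming from the eventual replacements ...

// final pass: put the real replacements back in place of the tokens
for ($i = 0; $i < $ntokens; $i++)
   $line = str_replace("~~" . $i . "~",
                       "<a href=\"" . $orig[$i] . "\">" . $orig[$i] . "</a>",
                       $line);
echo $line . "\n";
?>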