[Plplot-devel] Re: Unicode and plplot

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Hi,

     I understand what Per is on about, and it's important.

     I have been struggling with x24c as well.  The problem is that our
Unicode implementation currently requires that you have libunicode
installed.  If you don't have libunicode (as on my system at present),
then a vital section of code in plcore.c that translates UTF-8 to UCS-4
does not get executed.  ACK!!

     Per has provided us with a nice bit of code that allows us to avoid
libunicode altogether.  I have made a few adjustments to it, and have
removed the unnecessary section invoking libunicode from plcore.c.  Look
for the block whose leading comment credits "Per Persson".  Note that I
changed the indentation for all of plP_text because I just couldn't deal
with it anymore. ;^)

     I have tested the results for many examples (including x24c) on
the png and gcw drivers, and they appear to be good.  I would like to hear
if others accept this patch.  If so, then maybe we can apply it for
tomorrow's release.  It fixes a *nasty* bug.  Please advise.

Cheers,
Tom

--

--- plcore.c	2005-05-12 14:14:24.000000000 -0300
+++ plcore.c.new	2005-05-12 14:05:21.000000000 -0300
@@ -470,262 +470,307 @@
 	 PLINT refx, PLINT refy, const char *string)
 {

-   if (plsc->dev_text) { /* Does the device render it's own text ? */
-      EscText args;
-      short len=0;
-      char skip;
-      unsigned short i,j, k;
-      PLUNICODE code;
-      char esc;
-      int idx;
-
-      args.base = base;
-      args.just = just;
-      args.xform = xform;
-      args.x = x;
-      args.y = y;
-      args.refx = refx;
-      args.refy = refy;
-      args.string = string;
-
-      if (plsc->dev_unicode) { /* Does the device also understand unicode? */
-	 PLINT ig;
-	 PLUNICODE fci, fcisave;
-	 unsigned char hexdigit, hexpower;
-
-	 PLINT clpxmi, clpxma, clpymi, clpyma;
-
-	 /* Now process the text string */
-
-	 if (string!=NULL) {        /* If the string isn't blank, then we will
-                                     * continue
-				     */
-
-	    len=strlen(string);     /* this length is only used in the loop
-				     * counter, we will work out the length of
-				     * the unicode string as we go */
-	    plgesc(&esc);
+  if (plsc->dev_text) { /* Does the device render it's own text ? */
+    EscText args;
+    short len=0;
+    char skip;
+    unsigned short i,j, k;
+    PLUNICODE code;
+    char esc;
+    int idx;
+
+    args.base = base;
+    args.just = just;
+    args.xform = xform;
+    args.x = x;
+    args.y = y;
+    args.refx = refx;
+    args.refy = refy;
+    args.string = string;
+
+    if (plsc->dev_unicode) { /* Does the device also understand unicode? */
+      PLINT ig;
+      PLUNICODE fci, fcisave;
+      unsigned char hexdigit, hexpower;
+
+      PLINT clpxmi, clpxma, clpymi, clpyma;
+
+      /* Now process the text string */
+
+      if (string!=NULL) {        /* If the string isn't blank, then we will
+				  * continue
+				  */
+
+	len=strlen(string);     /* this length is only used in the loop
+				 * counter, we will work out the length of
+				 * the unicode string as we go */
+	plgesc(&esc);

-	    /* At this stage we will do some translations into unicode, like
-	     * conversion to Greek , and will save other translations such as
-	     * superscript for the driver to do later on. As we move through
-	     * the string and do the translations, we will get
-	     * rid of the esc character sequence, just replacing it with
-	     * unicode.
-	     */
+	/* At this stage we will do some translations into unicode, like
+	 * conversion to Greek , and will save other translations such as
+	 * superscript for the driver to do later on. As we move through
+	 * the string and do the translations, we will get
+	 * rid of the esc character sequence, just replacing it with
+	 * unicode.
+	 */

-	    /* Obtain FCI (font characterization integer) for start of
-	     * string. */
-	    plgfci(&fci);
-	    for (j=i=0;i<len;i++) {    /* Walk through the string, and convert
-					* some stuff to unicode on the fly */
-	       skip=0;
+	/* Obtain FCI (font characterization integer) for start of
+	 * string. */
+	plgfci(&fci);
+	for (j=i=0;i<len;i++) {    /* Walk through the string, and convert
+				    * some stuff to unicode on the fly */
+	  skip=0;

-	       if (string[i]==esc) {
-		  switch(string[i+1]) {
-		   case '(':  /* hershey code */
-		     i+=2+text2num(&string[i+2],')',&code);
-		     idx=plhershey2unicode(code);
-		     /* momentarily switch to symbol font. */
-		     fcisave = fci;
-		     plP_hex2fci(PL_FCI_SYMBOL, PL_FCI_FAMILY, &fci);
-		     unicode_buffer[j++]= fci;
-		     unicode_buffer[j++] = \
-		       (PLUNICODE)hershey_to_unicode_lookup_table[idx].Unicode;
-
-		     /* if unicode_buffer[j-1] corresponds to the escape
-		      * character must unescape it by appending one more.
-		      * This will probably always be necessary since it is
-		      * likely unicode_buffer will always have to contain
-		      * escape characters that are interpreted by the device
-		      * driver.
-		      */
-		     if (unicode_buffer[j-1]==esc) unicode_buffer[j++]=esc;
-		     fci = fcisave;
-		     unicode_buffer[j]= fci;
-		     skip=1;
-		     break;
+	  if (string[i]==esc) {
+	    switch(string[i+1]) {
+	    case '(':  /* hershey code */
+	      i+=2+text2num(&string[i+2],')',&code);
+	      idx=plhershey2unicode(code);
+	      /* momentarily switch to symbol font. */
+	      fcisave = fci;
+	      plP_hex2fci(PL_FCI_SYMBOL, PL_FCI_FAMILY, &fci);
+	      unicode_buffer[j++]= fci;
+	      unicode_buffer[j++] = \
+		(PLUNICODE)hershey_to_unicode_lookup_table[idx].Unicode;
+
+	      /* if unicode_buffer[j-1] corresponds to the escape
+	       * character must unescape it by appending one more.
+	       * This will probably always be necessary since it is
+	       * likely unicode_buffer will always have to contain
+	       * escape characters that are interpreted by the device
+	       * driver.
+	       */
+	      if (unicode_buffer[j-1]==esc) unicode_buffer[j++]=esc;
+	      fci = fcisave;
+	      unicode_buffer[j]= fci;
+	      skip=1;
+	      break;

-		   case '[':  /* unicode */
-		     i+=2+text2num(&string[i+2],']',&code);
-		     /* momentarily switch to symbol font. */
-		     fcisave = fci;
-		     plP_hex2fci(PL_FCI_SYMBOL, PL_FCI_FAMILY, &fci);
-		     unicode_buffer[j++]= fci;
-		     unicode_buffer[j++]=code;
-		     /* if unicode_buffer[j-1] corresponds to the escape
-		      * character must unescape it by appending one more.
-		      * This will probably always be necessary since it is
-		      * likely unicode_buffer will always have to contain
-		      * escape characters that are interpreted by the device
-		      * driver.
-		      */
-		     if (unicode_buffer[j-1]==esc) unicode_buffer[j++]=esc;
-		     fci = fcisave;
-		     unicode_buffer[j] = fci;
-		     skip=1;
-		     break;
+	    case '[':  /* unicode */
+	      i+=2+text2num(&string[i+2],']',&code);
+	      /* momentarily switch to symbol font. */
+	      fcisave = fci;
+	      plP_hex2fci(PL_FCI_SYMBOL, PL_FCI_FAMILY, &fci);
+	      unicode_buffer[j++]= fci;
+	      unicode_buffer[j++]=code;
+	      /* if unicode_buffer[j-1] corresponds to the escape
+	       * character must unescape it by appending one more.
+	       * This will probably always be necessary since it is
+	       * likely unicode_buffer will always have to contain
+	       * escape characters that are interpreted by the device
+	       * driver.
+	       */
+	      if (unicode_buffer[j-1]==esc) unicode_buffer[j++]=esc;
+	      fci = fcisave;
+	      unicode_buffer[j] = fci;
+	      skip=1;
+	      break;

-		   case '<':  /* change font*/
-		     i+=2;
-		     if ('0' <= string[i] && string[i] <= '9' ) {
-			i+=text2num(&string[i],'>', &code);
-			if (code & PL_FCI_MARK) {
-			   /* code is a complete FCI (font characterization
-			    * integer): change FCI to this value.
-			    */
-			   fci = code;
-			   unicode_buffer[j]=fci;
-			   skip=1;
-			}
-			else {
-			   /* code is not complete FCI. Change
-			    * FCI with hex power in rightmost hex
-			    * digit and hex digit value in second rightmost
-			    * hex digit.
-			    */
-			   hexdigit = (code >> 4) & PL_FCI_HEXDIGIT_MASK;
-			   hexpower = code & PL_FCI_HEXPOWER_MASK;
-			   plP_hex2fci(hexdigit, hexpower, &fci);
-			   unicode_buffer[j]=fci;
-			   skip=1;
-			}
-		     }
+	    case '<':  /* change font*/
+	      i+=2;
+	      if ('0' <= string[i] && string[i] <= '9' ) {
+		i+=text2num(&string[i],'>', &code);
+		if (code & PL_FCI_MARK) {
+		  /* code is a complete FCI (font characterization
+		   * integer): change FCI to this value.
+		   */
+		  fci = code;
+		  unicode_buffer[j]=fci;
+		  skip=1;
+		}
+		else {
+		  /* code is not complete FCI. Change
+		   * FCI with hex power in rightmost hex
+		   * digit and hex digit value in second rightmost
+		   * hex digit.
+		   */
+		  hexdigit = (code >> 4) & PL_FCI_HEXDIGIT_MASK;
+		  hexpower = code & PL_FCI_HEXPOWER_MASK;
+		  plP_hex2fci(hexdigit, hexpower, &fci);
+		  unicode_buffer[j]=fci;
+		  skip=1;
+		}
+	      }

-		     else {
-			/* align i on "<" because that is what text2fci
-			 * expects. */
-			i--;
-			i+=text2fci(&string[i], &hexdigit, &hexpower);
-			if (hexpower < 7) {
-			   plP_hex2fci(hexdigit, hexpower, &fci);
-			   unicode_buffer[j]=fci;
-			   skip=1;
-			}
-		     }
-		     break;
+	      else {
+		/* align i on "<" because that is what text2fci
+		 * expects. */
+		i--;
+		i+=text2fci(&string[i], &hexdigit, &hexpower);
+		if (hexpower < 7) {
+		  plP_hex2fci(hexdigit, hexpower, &fci);
+		  unicode_buffer[j]=fci;
+		  skip=1;
+		}
+	      }
+	      break;

-		   case 'f':  /* Deprecated Hershey-style font change*/
-		   case 'F':  /* Deprecated Hershey-style font change*/
-		     /* We implement an approximate response here so that
-		      * reasonable results are obtained for unicode fonts,
-		      * but this method is deprecated and the #<nnn> or
-		      * #<command string> methods should be used instead
-		      * to change unicode fonts in mid-string.
-		      */
-		     fci = PL_FCI_MARK;
-		     if (string[i+2] == 'n') {
-			/* medium, upright, sans-serif */
-			plP_hex2fci(PL_FCI_SANS, PL_FCI_FAMILY, &fci);
-		     } else if (string[i+2] == 'r') {
-			/* medium, upright, serif */
-			plP_hex2fci(PL_FCI_SERIF, PL_FCI_FAMILY, &fci);
-		     } else if (string[i+2] == 'i') {
-			/* medium, italic, serif */
-			plP_hex2fci(PL_FCI_ITALIC, PL_FCI_STYLE, &fci);
-			plP_hex2fci(PL_FCI_SERIF, PL_FCI_FAMILY, &fci);
-		     } else if (string[i+2] == 's') {
-			/* medium, upright, script */
-			plP_hex2fci(PL_FCI_SCRIPT, PL_FCI_FAMILY, &fci);
-		     } else
-		       fci = PL_FCI_IMPOSSIBLE;
+	    case 'f':  /* Deprecated Hershey-style font change*/
+	    case 'F':  /* Deprecated Hershey-style font change*/
+	      /* We implement an approximate response here so that
+	       * reasonable results are obtained for unicode fonts,
+	       * but this method is deprecated and the #<nnn> or
+	       * #<command string> methods should be used instead
+	       * to change unicode fonts in mid-string.
+	       */
+	      fci = PL_FCI_MARK;
+	      if (string[i+2] == 'n') {
+		/* medium, upright, sans-serif */
+		plP_hex2fci(PL_FCI_SANS, PL_FCI_FAMILY, &fci);
+	      } else if (string[i+2] == 'r') {
+		/* medium, upright, serif */
+		plP_hex2fci(PL_FCI_SERIF, PL_FCI_FAMILY, &fci);
+	      } else if (string[i+2] == 'i') {
+		/* medium, italic, serif */
+		plP_hex2fci(PL_FCI_ITALIC, PL_FCI_STYLE, &fci);
+		plP_hex2fci(PL_FCI_SERIF, PL_FCI_FAMILY, &fci);
+	      } else if (string[i+2] == 's') {
+		/* medium, upright, script */
+		plP_hex2fci(PL_FCI_SCRIPT, PL_FCI_FAMILY, &fci);
+	      } else
+		fci = PL_FCI_IMPOSSIBLE;

-		     if (fci != PL_FCI_IMPOSSIBLE){
-			i+=2;
-			unicode_buffer[j] = fci;
-			skip = 1;
-		     }
-		     break;
+	      if (fci != PL_FCI_IMPOSSIBLE){
+		i+=2;
+		unicode_buffer[j] = fci;
+		skip = 1;
+	      }
+	      break;

-		   case 'g':  /* Greek font */
-		   case 'G':  /* Greek font */
-		     /* Get the index in the lookup table
-		      * 527 = upper case alpha displacement in Hershey Table
-		      * 627 = lower case alpha displacement in Hershey Table
-		      */
-		     /* momentarily switch to symbol font. */
-		     fcisave = fci;
-		     plP_hex2fci(PL_FCI_SYMBOL, PL_FCI_FAMILY, &fci);
-		     unicode_buffer[j++]= fci;
-		     ig = plP_strpos(plP_greek_mnemonic, string[i+2]);
-		     if (ig >= 0) {
-			if (ig >= 24)
-			  ig = ig + 100 - 24;
-			idx=plhershey2unicode(ig+527);
-			unicode_buffer[j++] = \
-			  (PLUNICODE)hershey_to_unicode_lookup_table[idx].Unicode;
-			i+=2;
-			skip=1;  /* skip is set if we have copied something
-				  * into the unicode table */
-		     }
-		     else {
-			/* Use "unknown" unicode character if string[i+2]
-			 * is not in the Greek array.*/
-			unicode_buffer[j++]=(PLUNICODE)0x00;
-			i+=2;
-			skip=1;  /* skip is set if we have copied something
-				  * into  the unicode table */
-		     }
-		     fci = fcisave;
-		     unicode_buffer[j]= fci;
-		     break;
+	    case 'g':  /* Greek font */
+	    case 'G':  /* Greek font */
+	      /* Get the index in the lookup table
+	       * 527 = upper case alpha displacement in Hershey Table
+	       * 627 = lower case alpha displacement in Hershey Table
+	       */
+	      /* momentarily switch to symbol font. */
+	      fcisave = fci;
+	      plP_hex2fci(PL_FCI_SYMBOL, PL_FCI_FAMILY, &fci);
+	      unicode_buffer[j++]= fci;
+	      ig = plP_strpos(plP_greek_mnemonic, string[i+2]);
+	      if (ig >= 0) {
+		if (ig >= 24)
+		  ig = ig + 100 - 24;
+		idx=plhershey2unicode(ig+527);
+		unicode_buffer[j++] = \
+		  (PLUNICODE)hershey_to_unicode_lookup_table[idx].Unicode;
+		i+=2;
+		skip=1;  /* skip is set if we have copied something
+			  * into the unicode table */
+	      }
+	      else {
+		/* Use "unknown" unicode character if string[i+2]
+		 * is not in the Greek array.*/
+		unicode_buffer[j++]=(PLUNICODE)0x00;
+		i+=2;
+		skip=1;  /* skip is set if we have copied something
+			  * into  the unicode table */
+	      }
+	      fci = fcisave;
+	      unicode_buffer[j]= fci;
+	      break;

-		  }
-	       }
+	    }
+	  }

-	       if (skip==0) {
-#ifdef HAVE_LIBUNICODE
-                  unicode_char_t unichar;
-		  char* ptr =
-		    unicode_get_utf8 (string + i, &unichar);
-                  if (ptr == NULL) {
-                    char buf[80];
-                    strncpy (buf, string, 30);
-                    sprintf (buf, "UTF-8 string is malformed: %s%s",
-                             buf, strlen (string) > 30 ? "[...]" : "");
-                    plabort (buf);
-                  }
-                  unicode_buffer [j] = (PLUNICODE) unichar;
-                  i += ptr - (string + i) - 1;
-#else
-		  unicode_buffer[j]=string[i];
-#endif
-		  /* Search for escesc (an unescaped escape) in the input
-		   * string and adjust unicode_buffer accordingly).
-		   */
-		  if (unicode_buffer[j] == esc && string[i+1] == esc) {
-		    i++;
-		    unicode_buffer[++j] = esc;
-		  }
-	       }
-	       j++;
+	  /* This section does utf8 to ucs4 conversion without
+	   * using libunicode.  Contributed by Per Persson.
+	   */
+	  if (skip==0) {
+	    PLUNICODE unichar;
+	    char tmp;
+	    char *ptr = (char *)string + i;
+	    int isFirst = 1;
+	    int cnt;
+
+	    do {
+	      /* Get next character in string */
+	      tmp = *ptr++;
+	      if (isFirst) { /* First char in UTF8 sequence */
+		isFirst = 0;
+		/* Determine length of sequence */
+		if ((unsigned char)(tmp&0x80)==0x00) { /* single char */
+		  unichar = (PLUNICODE)tmp & 0x7F;
+		  cnt = 0;
+		} else if((unsigned char)(tmp&0xE0)==0xC0) { /* 2 chars */
+		  unichar = (PLUNICODE)(tmp & 0x1F);
+		  cnt = 1;
+		} else if((unsigned char)(tmp&0xF0)==0xE0) { /* 3 chars */
+		  unichar = (PLUNICODE)(tmp & 0x0F);
+		  cnt = 2;
+		} else if((unsigned char)(tmp&0xF8)==0xF0) { /* 4 chars */
+		  unichar = (PLUNICODE)(tmp & 0x07);
+		  cnt = 3;
+		} else if((unsigned char)(tmp&0xFC)==0xF8) { /* 5 chars */
+		  unichar = (PLUNICODE)(tmp & 0x03);
+		  cnt = 4;
+		} else if((unsigned char)(tmp&0xFE)==0xFC) { /* 6 chars */
+		  unichar = (PLUNICODE)(tmp & 0x01);
+		  cnt = 5;
+		} else { /* Malformed */
+		  ptr = NULL;
+		  cnt = 0;
+		}
+	      }
+	      else { /* Subsequent char in UTF8 sequence */
+		if ((unsigned char)(tmp & 0xC0) == 0x80) {
+		  unichar = (unichar << 6) | ((PLUNICODE)(tmp & 0x3F));
+		  cnt--;
+		}
+		else { /* Malformed */
+		  ptr = NULL;
+		  cnt = 0;
+		}
+	      }
+	    } while (cnt > 0);
+	    if (ptr == NULL) {
+	      char buf[80];
+	      strncpy (buf, string, 30);
+	      sprintf (buf, "UTF-8 string is malformed: %s%s",
+		       buf, strlen (string) > 30 ? "[...]" : "");
+	      plabort (buf);
 	    }
-	    if (j > 0) {
-	       args.unicode_array_len=j; /* Much easier to set the length than
-					  * work it out later :-) */
-	       args.unicode_array=&unicode_buffer[0]; /* Get address of the
-						       * unicode buffer (even
-						       * though it is
-						       * currently  static) */
-	    } else
-	      /* Don't print anything, if there is no unicode to print! */
-	      return;
-	 }
-      }
+	    unicode_buffer [j] = (PLUNICODE) unichar;
+	    i = ptr - string - 1;
+
+
+	    /* Search for escesc (an unescaped escape) in the input
+	     * string and adjust unicode_buffer accordingly).
+	     */
+	    if (unicode_buffer[j] == esc && string[i+1] == esc) {
+	      i++;
+	      unicode_buffer[++j] = esc;
+	    }
+	  } /* if (skip==0) */
+	  j++;
+	}  /* for (j=i=0;i<len;i++) */
+
+	if (j > 0) {
+	  args.unicode_array_len=j; /* Much easier to set the length than
+				     * work it out later :-) */
+	  args.unicode_array=&unicode_buffer[0]; /* Get address of the
+						  * unicode buffer (even
+						  * though it is
+						  * currently  static) */
+	} else
+	  /* Don't print anything, if there is no unicode to print! */
+	  return;
+      } /* if (string!=NULL) */
+    }/* if (plsc->dev_unicode) */

-      if (plsc->dev_unicode) {
-	args.string=NULL; /* We are using unicode */
-      }
-      else  {
-	args.string = string;
-      }
+    if (plsc->dev_unicode) {
+      args.string=NULL; /* We are using unicode */
+    }
+    else  {
+      args.string = string;
+    }

-      plP_esc(PLESC_HAS_TEXT, &args);
+    plP_esc(PLESC_HAS_TEXT, &args);
 #ifndef DEBUG_TEXT
-   } else {
+  } else {
 #endif
-      plstr(base, xform, refx, refy, string);
-   }
+    plstr(base, xform, refx, refy, string);
+  }
 }

 static void

--
Thomas J. Duck <to...@fi...>

  Department of Physics and Atmospheric Science, Dalhousie University,
    Halifax, Nova Scotia, Canada, B3H 3J5.
  Tel: (902)494-1456 | Fax: (902)494-5191 | Lab: (902)494-3813
  Web: http://aolab.phys.dal.ca/~tomduck/
  Public key: http://pgp.mit.edu:11371/pks/lookup?op=get&search=0x17D965DB


[Plplot-devel] Re: Unicode and plplot

Cross-platform, scientific graphics plotting library

[Plplot-devel] Re: Unicode and plplot