Péter Szabó - 2008-11-15

Sorry, the previous one didn't update p->col properly. Here is the correct solution.

This is a speed optimization which makes p_goto_eol faster on UTF-8 and CRLF
buffers. The general observation is that no special casing is needed to find
the end of line in UTF-8, because UTF-8 keeps '\n', '\t' and '\r' intact (these
bytes never occur inside a multibyte character). The only special casing
needed for CRLF buffers is to stop at '\r'.

Index: b.c

--- b.c (revision 40)
+++ b.c (working copy)
@@ -721,15 +721,62 @@
return c;
}

+/** Finish decoding a UTF-8 character and return its Unicode code point.
+ * c is the first byte (0 <= c <= 0xff), already read; p is positioned as after
+ * c = pgetb(p). Assumes a UTF-8 buffer. Continuation bytes are consumed with
+ * pgetb, which may invalidate p->col (i.e. set p->valcol = 0). Returns 'X' for
+ * an invalid start byte or when a non-continuation byte cuts the sequence short.
+ */
+static int pget_restc(P *p, int c) {
+ int n, d; /* n: continuation bytes still expected; d: next byte in the buffer */
+
+ if ((c&0xE0)==0xC0) { /* Two bytes */
+ n = 1;
+ c &= 0x1F;
+ } else if ((c&0xF0)==0xE0) { /* Three bytes */
+ n = 2;
+ c &= 0x0F;
+ } else if ((c&0xF8)==0xF0) { /* Four bytes */
+ n = 3;
+ c &= 0x07;
+ } else if ((c&0xFC)==0xF8) { /* Five bytes */
+ n = 4;
+ c &= 0x03;
+ } else if ((c&0xFE)==0xFC) { /* Six bytes */
+ n = 5;
+ c &= 0x01;
+ } else if ((c&0x80)==0x00) { /* One byte: returned unchanged */
+ n = 0;
+ } else { /* 128-191, 254, 255: Not a valid UTF-8 start character */
+ n = 0;
+ c = 'X';
+ /* c -= 384; */
+ }
+ if (n) {
+ while (n) {
+ d = brc(p);
+ if ((d&0xC0)!=0x80) /* not of the form 10xxxxxx: not a continuation byte */
+ break;
+ pgetb(p);
+ c = ((c<<6)|(d&0x3F)); /* append the 6 payload bits */
+ --n;
+ }
+ if (n) { /* FIXME: there was a bad UTF-8 sequence */
+ /* How to represent this? */
+ /* pbkwd(p,m-n);
+ c = oc - 384; */
+ c = 'X';
+ }
+ }
+ return c;
+}
+
/* return current character and move p to the next character. column will be updated if it was valid. */
int pgetc(P *p)
{
if (p->b->o.charmap->type) {
int val;
int c; /* , oc; */
- int d;
- int n; /* , m; */
- int wid = 0;

val = p->valcol; /* Remember if column number was valid */
c = pgetb(p); /* Get first byte */
@@ -737,53 +784,10 @@

if (c==NO_MORE_DATA)
return c;
+
+ /* High bit set: not ASCII, so let pget_restc decode (or reject) the sequence. */
+ if (c & 0x80) c = pget_restc(p, c);

- if ((c&0xE0)==0xC0) { /* Two bytes */
- n = 1;
- c &= 0x1F;
- } else if ((c&0xF0)==0xE0) { /* Three bytes */
- n = 2;
- c &= 0x0F;
- } else if ((c&0xF8)==0xF0) { /* Four bytes */
- n = 3;
- c &= 0x07;
- } else if ((c&0xFC)==0xF8) { /* Five bytes */
- n = 4;
- c &= 0x03;
- } else if ((c&0xFE)==0xFC) { /* Six bytes */
- n = 5;
- c &= 0x01;
- } else if ((c&0x80)==0x00) { /* One byte */
- n = 0;
- } else { /* 128-191, 254, 255: Not a valid UTF-8 start character */
- n = 0;
- c = 'X';
- /* c -= 384; */
- }
-
- /* m = n; */
-
- if (n) {
- while (n) {
- d = brc(p);
- if ((d&0xC0)!=0x80)
- break;
- pgetb(p);
- c = ((c<<6)|(d&0x3F));
- --n;
- }
- if (n) { /* FIXME: there was a bad UTF-8 sequence */
- /* How to represent this? */
- /* pbkwd(p,m-n);
- c = oc - 384; */
- c = 'X';
- wid = 1;
- } else if (val)
- wid = joe_wcwidth(1,c);
- } else {
- wid = 1;
- }
-
if (val) { /* Update column no. if it was valid to start with */
p->valcol = 1;
if (c=='\t')
@@ -791,7 +795,7 @@
else if (c=='\n')
p->col = 0;
else
- p->col += wid;
+ p->col += (c >= 0x80) ? joe_wcwidth(1, c) : 1; /* below 0x80 (ASCII, or the 'X' placeholder): one column */
}

return c;
@@ -974,20 +978,25 @@
/* move p to the end of line */
P *p_goto_eol(P *p)
{
- if (p->b->o.crlf || p->b->o.charmap->type)
- while (!piseol(p))
- pgetc(p);
- else
+ int is_utf8 = p->b->o.charmap->type;
+ int is_crlf = p->b->o.crlf;
+ if (!p->valcol) p_goto_bol(p); /* col is accumulated below and must start valid; presumably p_goto_bol ensures that - confirm */
+ if (1)
while (p->ofst != GSIZE(p->hdr)) {
unsigned char c;

c = GCHAR(p);
if (c == '\n')
break;
+ else if (c == '\r' && is_crlf && piseol(p))
+ break;
else {
++p->byte;
++p->ofst;
- if (c == '\t')
+ if ((c & 0x80) && is_utf8) {
+ /* Do not store the code point in c: c is an unsigned char and would truncate it to 8 bits. */
+ p->col += joe_wcwidth(1, pget_restc(p, c));
+ } else if (c == '\t')
p->col += p->b->o.tab - p->col % p->b->o.tab;
else
++p->col;
@@ -995,6 +1004,7 @@
pnext(p);
}
}
+ p->valcol = 1; /* pget_restc may have cleared valcol via pgetb, but col was kept up to date above */
return p;
}