gnosygnu - 2014-05-07

Hi! I'm the developer for XOWA -- an offline Wikipedia application. About a month ago, I changed XOWA to use Luaj to process the Lua statements in Wikipedia's offline dumps. I'd just like to say how impressed I was with Luaj's functionality and performance. So far, it handles all the Lua code in 30+ wikis, including en.wikipedia.org, en.wiktionary.org, etc.

In the process of integrating Luaj into XOWA, I came across a small number of issues. I don't know if this board is still active, but I wanted to list them below. Admittedly, the format is unstructured. If there is a formal process for submitting patches to Luaj, let me know, and I will follow up accordingly.

You can also download XOWA's modified source of luaj here: https://sourceforge.net/projects/xowa/files/support/luaj/v1/

Thanks for developing such a great library!


  • os.date does not accept UTC format
    • file: /src/core/org/luaj/vm2/lib/OsLib.java
    • proc: invoke.DATE

boolean utc = false;
if (s.startsWith("!")) {
  utc = true;
  s = s.substring(1);
}
if (s.equals("*t")) {
  Calendar d = Calendar.getInstance();
  long time_in_ms = (long)(t*1000);
  if (utc) {
    java.util.TimeZone current_tz = d.getTimeZone();
    int offset_from_utc = current_tz.getOffset(time_in_ms);
    time_in_ms += -offset_from_utc;
  }

  • string.gsub fails with out_of_bounds error
    • file: /src/core/org/luaj/vm2/lib/StringLib.java
    • proc: gsub

      old:
          if ( anchor )
            break;
      new:
          if ( anchor )
            break;
          if (soffset >= srclen) break; // assert soffset is in bounds, else will throw ArrayIndexOutOfBounds exception;

  • string.gsub fails if string is empty
    • file: /src/core/org/luaj/vm2/lib/StringLib.java
    • proc: gsub

      static Varargs gsub( Varargs args ) {
        LuaString src = args.checkstring( 1 );
        final int srclen = src.length();
        if (srclen == 0) return varargsOf(src, LuaValue.ZERO); // exit early

  • string.format ignores precision for double args
    • file: /src/core/org/luaj/vm2/lib/StringLib.java
    • note: also fixes format failures
      • %0.1f -> remove leading 0
      • %02.f -> add trailing 0 after .
    • note: forces a 1.5 JRE (as opposed to 1.3)

                        FormatDesc fdsc = new FormatDesc(args, fmt, i );
                        int fdsc_bgn = i;

      old:
                        case 'G':
                            fdsc.format( result, args.checkdouble( arg ) );
      new:
                        case 'G':
                            String fmt_str = new String(fmt.m_bytes, fdsc_bgn - 1, fdsc.length + 1);    // -1 to include %; +1 to account for included %; basically get everything between % and f; EX: a%.1fb -> %.1f 
                            fdsc.format( result, fmt_str, args.checkdouble( arg ));

* proc: format

      old: 
                        buf.append( v) );
      new:
            // buf.append( String.valueOf( x ) );
            if (fmt.startsWith("%0."))
                fmt = "%" + fmt.substring(2);               // remove leading 0, else MissingFormatWidthException
            int fmt_len = fmt.length();
            if (fmt_len > 1 && fmt.charAt(fmt_len - 2) == '.')  // penultimmate char has "."
                fmt = fmt.substring(0, fmt_len - 1) + "0" + fmt.charAt(fmt_len - 1);    // add trailing 0, else UnknownFormatConversionException; EX: "02.f" -> "02.0f" 
            buf.append( String.format(fmt, v) );    // call String.format

  • string.gmatch issues
    • file: /src/core/org/luaj/vm2/lib/StringLib.java
    • proc: GmatchAux.invoke

      old:
            for ( ; soffset<srclen; soffset++ ) {
      new:
            for ( ; soffset<=srclen; soffset++ ) {

      old:
                                        soffset = res;
      new:
                    int soffset_adj = res == soffset ? 1 : 0;
                    soffset = res + soffset_adj;

  • string.tonumber should trim all whitespace
    • file: /src/core/org/luaj/vm2/LuaString.java
    • proc: scannumber

    // trim ws
    int idx = i;
    while (idx < j) {
        switch (m_bytes[idx]) {
            case 9: case 10: case 13: case 32:
                ++idx;
                i = idx;
                break;
            default:
                idx = j;
                break;
        }
    }
    idx = j - 1;
    while (idx >= i) {
        switch (m_bytes[idx]) {
            case 9: case 10: case 13: case 32:
                j = idx;
                --idx;
                break;
            default:
                idx = i -1;
                break;
        }
    }

  • multi-byte strings not fully supported
    • file: /src/core/org/luaj/vm2/LuaString.java

    /**
     * Builds a LuaString holding the UTF-8 encoding of {@code len} UTF-16 chars
     * taken from {@code chars} starting at index {@code off}.
     *
     * Replaces an earlier byte-per-char copy that truncated every char to a
     * single byte and therefore could not represent multi-byte characters.
     *
     * @param chars source UTF-16 chars
     * @param off   index of the first char to encode
     * @param len   number of chars to encode
     * @return the result of valueOf(byte[], int, int) over the encoded bytes
     */
    public static LuaString valueOf(char[] chars, int off, int len) {
        // pass 1: size the byte array; a high surrogate counts as 4 bytes and
        // consumes the following char as well
        int bry_len = 0;
        for (int i = 0; i < len; i++) {
            int b_len = LuaString.Utf16_Len_by_char((int)(chars[i + off]));
            if (b_len == 4) ++i;        // 4 bytes; surrogate pair; skip next char;
            bry_len += b_len;
        }

        // pass 2: encode
        byte[] bry = new byte[bry_len];
        int bry_idx = 0;
        int i = 0;
        while (i < len) {
            // the encoder reads the low surrogate from c_ary[c_pos + 1], so it
            // must be given the absolute index into chars, not the index
            // relative to off
            int c_pos = i + off;
            int b_len = Utf16_Encode_char(chars[c_pos], chars, c_pos, bry, bry_idx);
            bry_idx += b_len;
            i += b_len == 4 ? 2 : 1;    // 4 bytes; surrogate pair; skip next char;
        }
        return valueOf(bry, 0, bry_len);
    }

    /**
     * Decodes a slice of a byte array as UTF-8 text.
     *
     * Delegates to the JDK decoder; this replaces an earlier hand-written
     * decoder that only understood sequences of up to three bytes.
     *
     * @param bytes  array containing the encoded text
     * @param offset index of the first byte to decode
     * @param length number of bytes to decode
     * @return the decoded string
     */
    public static String decodeAsUtf8(byte[] bytes, int offset, int length) {
        final java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
        String decoded = new String(bytes, offset, length, utf8);
        return decoded;
    }

    /**
     * Counts the bytes needed to encode a whole char array as UTF-8.
     *
     * Replaces an earlier version that never counted more than three bytes
     * per char. A char reported as 4 bytes wide by Utf16_Len_by_char is the
     * first half of a surrogate pair, so the char after it is skipped.
     *
     * @param chars UTF-16 chars to measure
     * @return total number of UTF-8 bytes
     */
    public static int lengthAsUtf8(char[] chars) {
        final int end = chars.length;
        int total = 0;
        int pos = 0;
        while (pos < end) {
            int width = LuaString.Utf16_Len_by_char(chars[pos]);
            total += width;
            pos += (width == 4) ? 2 : 1;    // surrogate pair occupies two chars
        }
        return total;
    }

    /**
     * Encodes the first {@code nchars} chars of {@code chars} as UTF-8 into
     * {@code bytes}, starting at index {@code off}.
     *
     * Replaces an earlier inline encoder that emitted at most three bytes per
     * char; encoding is now delegated to Utf16_Encode_char so surrogate pairs
     * become 4-byte sequences.
     *
     * @param chars  source UTF-16 chars
     * @param nchars number of chars to encode
     * @param bytes  destination array
     * @param off    index in {@code bytes} of the first byte written
     * @return {@code nchars} (the number of chars consumed, NOT the number of
     *         bytes written)
     */
    public static int encodeToUtf8(char[] chars, int nchars, byte[] bytes, int off) {
        int out_pos = off;
        for (int in_pos = 0; in_pos < nchars; ) {
            int written = Utf16_Encode_char(chars[in_pos], chars, in_pos, bytes, out_pos);
            out_pos += written;
            in_pos += (written == 4) ? 2 : 1;   // 4 bytes; surrogate pair; skip next char;
        }
        // NOTE: the byte count is deliberately not returned; per the original
        // author, Globals.UTF8Stream.read caches the return value and uses it
        // as an index into a char[] (not a byte[]), which would go out of
        // bounds if a byte count were returned.
        // NOTE(review): confirm this against the caller before relying on it.
        return nchars;
    }

    /**
     * Returns the number of UTF-8 bytes produced for one UTF-16 char.
     *
     * A high surrogate (0xD800..0xDBFF) reports 4, which is the width of the
     * whole surrogate pair it introduces; any other value below 65536 that is
     * not in the 1- or 2-byte ranges reports 3.
     *
     * @param c char value, widened to int
     * @return 1, 2, 3 or 4
     * @throws RuntimeException if {@code c} is 65536 or greater
     */
    private static int Utf16_Len_by_char(int c) {
        if (c >= 0 && c <= 0x7F) return 1;          // below 1 << 7
        if (c < 0x800) return 2;                    // below 1 << 11
        boolean hi_surrogate = c >= 0xD800 && c <= 0xDBFF;
        if (hi_surrogate) return 4;                 // first half of a pair
        if (c <= 0xFFFF) return 3;                  // below 1 << 16
        throw new RuntimeException("UTF-16 int must be between 0 and 2097152; char=" + c);
    }
    /**
     * Returns the number of UTF-8 bytes needed to encode a code point.
     *
     * @param c code point; must satisfy 0 <= c < 2097152 (1 << 21)
     * @return 1, 2, 3 or 4
     * @throws RuntimeException if {@code c} is negative or 2097152 or greater
     */
    public static int Utf16_Len_by_int(int c) {
        // reject negatives up front; otherwise they would satisfy the
        // "c < 2048" test below and be reported as 2-byte characters
        if (c < 0 || c >= 2097152)
            throw new RuntimeException("UTF-16 int must be between 0 and 2097152; char=" + c);
        if (c <    128) return 1;       // 1 <<  7
        if (c <   2048) return 2;       // 1 << 11
        if (c <  65536) return 3;       // 1 << 16
        return 4;                       // below 1 << 21
    }
    /**
     * Returns the length in bytes of a UTF-8 sequence, judged from its first
     * byte alone (SEE:w:UTF-8).
     *
     * Values 0..191 (ASCII and continuation bytes alike) report 1.
     *
     * @param b first byte of the sequence
     * @return 1, 2, 3 or 4
     * @throws RuntimeException if the unsigned value of {@code b} is 248 or greater
     */
    public static int Utf8_Len_of_char_by_1st_byte(byte b) {
        final int lead = b & 0xff;      // Java bytes are signed; work with 0..255
        if (lead < 0xC0) return 1;      //   0..191
        if (lead < 0xE0) return 2;      // 192..223
        if (lead < 0xF0) return 3;      // 224..239
        if (lead < 0xF8) return 4;      // 240..247
        throw new RuntimeException("invalid initial utf8 byte; byte=" + b);
    }
    /**
     * Decodes the UTF-8 sequence that starts at {@code ary[pos]} into a
     * single code point.
     *
     * The sequence length is taken from the lead byte; continuation bytes are
     * masked to their low six bits without being validated.
     *
     * @param ary array holding the encoded bytes
     * @param pos index of the lead byte
     * @return the decoded code point
     * @throws RuntimeException if the lead byte is not a valid start of a
     *         1- to 4-byte sequence
     */
    public static int Utf16_Decode_to_int(byte[] ary, int pos) {
        final byte lead = ary[pos];
        if (lead >= 0) {
            return lead;                                // 0xxxxxxx: plain ASCII
        }
        final int tag = lead & 0xF0;
        if (tag == 0xC0 || tag == 0xD0) {               // 110xxxxx: 2 bytes
            int hi = (lead & 0x1f) << 6;
            int lo = ary[pos + 1] & 0x3f;
            return hi | lo;
        }
        if (tag == 0xE0) {                              // 1110xxxx: 3 bytes
            int hi  = (lead & 0x0f) << 12;
            int mid = (ary[pos + 1] & 0x3f) << 6;
            int lo  = ary[pos + 2] & 0x3f;
            return hi | mid | lo;
        }
        if (tag == 0xF0 && (lead & 0x08) == 0) {        // 11110xxx: 4 bytes
            int b0 = (lead & 0x07) << 18;
            int b1 = (ary[pos + 1] & 0x3f) << 12;
            int b2 = (ary[pos + 2] & 0x3f) << 6;
            int b3 = ary[pos + 3] & 0x3f;
            return b0 | b1 | b2 | b3;
        }
        throw new RuntimeException("invalid utf8 byte: byte=" + lead);
    }
    /**
     * Encodes a code point as UTF-8 into {@code src}, starting at {@code pos}.
     *
     * @param c   code point; must satisfy 0 <= c < 2097152 (1 << 21)
     * @param src destination byte array
     * @param pos index of the first byte written
     * @return number of bytes written (1 to 4)
     * @throws RuntimeException if {@code c} is negative or 2097152 or greater;
     *         nothing is written in that case
     */
    public static int Utf16_Encode_int(int c, byte[] src, int pos) {
        // reject negatives up front; otherwise they would satisfy the
        // "c < 2048" test below and be written out as two meaningless bytes
        if (c < 0 || c >= 2097152)
            throw new RuntimeException("UTF-16 int must be between 0 and 2097152; char=" + c);
        if (c < 128) {
            src[pos]     = (byte)c;
            return 1;
        }
        if (c < 2048) {
            src[pos]     = (byte)(0xC0 |  (c >>  6));
            src[pos + 1] = (byte)(0x80 |  (c         & 0x3F));
            return 2;
        }
        if (c < 65536) {
            src[pos]     = (byte)(0xE0 |  (c >> 12));
            src[pos + 1] = (byte)(0x80 | ((c >>  6)  & 0x3F));
            src[pos + 2] = (byte)(0x80 |  (c         & 0x3F));
            return 3;
        }
        src[pos]     = (byte)(0xF0 |  (c >> 18));
        src[pos + 1] = (byte)(0x80 | ((c >> 12)  & 0x3F));
        src[pos + 2] = (byte)(0x80 | ((c >>  6)  & 0x3F));
        src[pos + 3] = (byte)(0x80 |  (c         & 0x3F));
        return 4;
    }
    /**
     * Encodes one UTF-16 char as UTF-8; when the char is a high surrogate the
     * following char is consumed too and the pair is written as a single
     * 4-byte sequence.
     *
     * @param c     the char at {@code c_ary[c_pos]}, widened to int
     * @param c_ary array the char came from; read only to fetch the low
     *              surrogate at {@code c_pos + 1}
     * @param c_pos index of {@code c} within {@code c_ary}
     * @param b_ary destination byte array
     * @param b_pos index of the first byte written
     * @return number of bytes written; 4 means two chars were consumed
     * @throws RuntimeException if {@code c} is a high surrogate and it is the
     *         last element of {@code c_ary}
     */
    public static int Utf16_Encode_char(int c, char[] c_ary, int c_pos, byte[] b_ary, int b_pos) {
        if     ((c >   -1)
             && (c < 128)) {
            b_ary[  b_pos]  = (byte)c;
            return 1;
        }
        else if (c < 2048) {
            b_ary[  b_pos]  = (byte)(0xC0 | (c >>   6));
            b_ary[++b_pos]  = (byte)(0x80 | (c & 0x3F));
            return 2;
        }
        else if((c > 55295)             // 0xD800 and above
             && (c < 56320)) {          // up to 0xDBFF: high surrogates only
            // the low surrogate lives at c_pos + 1, so that is the index
            // which has to be inside the array
            if (c_pos + 1 >= c_ary.length)
                throw new RuntimeException("incomplete surrogate pair at end of string; char=" + c);
            int nxt_char = c_ary[c_pos + 1];
            int v = Utf16_Surrogate_merge(c, nxt_char);
            b_ary[b_pos]    = (byte)(0xF0 | (v >> 18));
            b_ary[++b_pos]  = (byte)(0x80 | (v >> 12) & 0x3F);
            b_ary[++b_pos]  = (byte)(0x80 | (v >>  6) & 0x3F);
            b_ary[++b_pos]  = (byte)(0x80 | (v        & 0x3F));
            return 4;
        }
        else {
            b_ary[b_pos]    = (byte)(0xE0 | (c >> 12));
            b_ary[++b_pos]  = (byte)(0x80 | (c >>  6) & 0x3F);
            b_ary[++b_pos]  = (byte)(0x80 | (c        & 0x3F));
            return 3;
        }
    }
    /**
     * Combines a high and a low surrogate into the code point they represent.
     * REF: http://perldoc.perl.org/Encode/Unicode.html
     */
    private static int Utf16_Surrogate_merge(int hi, int lo) {
        return 0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00);
    }
* file: /src/core/org/luaj/vm2/compiler/LexState.java
* proc: read_string

                        if (c > UCHAR_MAX)
                            lexerror("escape sequence too large", TK_STRING);
                        save(c, false); // NOTE: specify that c is integer and does not need conversion; EX: \128 -> 128 -> (char)128, not Utf8_encode(128)

* file: /src/core/org/luaj/vm2/compiler/LexState.java

    /** Appends {@code c} to the token buffer, treating it as a possible UTF-8 lead byte. */
    void save(int c) {save(c, true);}

    /**
     * Appends one character to the token buffer.
     *
     * When {@code c_might_be_utf8} is true and {@code c} is the lead byte of
     * a multi-byte UTF-8 sequence, the remaining bytes are pulled from the
     * input with nextChar() and the sequence is decoded first. EX: left-arrow
     * arrives as 226,134,144; c is 226; 134 and 144 are read and the three
     * bytes are converted to left-arrow.
     *
     * @param c               the value to append, or the lead byte of a
     *                        UTF-8 sequence
     * @param c_might_be_utf8 false when {@code c} is already a final char
     *                        value that must not be decoded (EX: a numeric
     *                        escape such as \128)
     */
    void save(int c, boolean c_might_be_utf8) {
        int bytes_len = c_might_be_utf8 ? LuaString.Utf8_Len_of_char_by_1st_byte((byte)c) : 1;
        if (bytes_len > 1) {
            // NOTE(review): temp_bry is static, so it is shared by every
            // instance; confirm lexing never runs concurrently
            temp_bry[0] = (byte)c;
            for (int i = 1; i < bytes_len; i++) {
                nextChar();
                temp_bry[i] = (byte)current;
            }
            c = LuaString.Utf16_Decode_to_int(temp_bry, 0);
        }
        if (c >= 0x10000 && c <= 0x10FFFF) {
            // a code point above 0xFFFF does not fit in one char; casting it
            // would silently drop the high bits, so store a surrogate pair
            int v = c - 0x10000;
            save_utf16_unit((char)(0xD800 + (v >> 10)));
            save_utf16_unit((char)(0xDC00 + (v & 0x3FF)));
        }
        else {
            save_utf16_unit((char)c);
        }
    }

    /** Appends a single char to buff, growing the buffer first when it is full. */
    private void save_utf16_unit(char ch) {
        if ( buff == null || nbuff + 1 > buff.length )
            buff = LuaC.realloc( buff, nbuff*2+1 );
        buff[nbuff++] = ch;
    }
    // scratch space for the bytes of one UTF-8 sequence (at most 4 slots are used)
    private static byte[] temp_bry = new byte[6];
 

Last edit: gnosygnu 2014-05-07