Work at SourceForge, help us to make it a better place! We have an immediate need for a Support Technician in our San Francisco or Denver office.

Close

#5 Bytestrings are not converted to default encoding

open
nobody
None
5
2005-07-05
2005-07-05
Anonymous
No

jaenichen@globalpark.de:

Compressed UTF-16 strings in BIFF8 are not converted to
the default encoding because PHP-ExcelReader treats
them like ASCII strings.

I added parsing of the CODEPAGE record and decompression of
compressed UTF-16 strings to PHP-ExcelReader.

The patchfile is attached to this message.

Discussion

  • Logged In: NO

    jaenichen@globalpark.de:
    Somehow couldn't upload the patchfile, so here it is:

    --- Excel/reader.php 2004-10-04 19:54:44.000000000 +0200
    +++ reader.php 2005-07-05 15:03:31.000000000 +0200
    @@ -42,6 +42,7 @@
    define('Spreadsheet_Excel_Reader_Type_UNKNOWN', 0xffff);

    define('Spreadsheet_Excel_Reader_Type_NINETEENFOUR', 0x22);

    define('Spreadsheet_Excel_Reader_Type_MERGEDCELLS', 0xE5);

    +define('Spreadsheet_Excel_Reader_Type_CODEPAGE', 0x42);

    define('Spreadsheet_Excel_Reader_utcOffsetDays' , 25569);

    define('Spreadsheet_Excel_Reader_utcOffsetDays1904', 24107);

    @@ -79,6 +80,7 @@
    var $pos;

    var $_ole;

    var $_defaultEncoding;

    + var $_codepage;

    var $_defaultFormat =
    Spreadsheet_Excel_Reader_DEF_NUM_FORMAT;

    var $_columnsFormat = array();

    var $_rowoffset = 1;

    @@ -333,7 +335,13 @@

    }

    }

    - $retstr =
    ($asciiEncoding) ? $retstr : $this->_encodeUTF16($retstr);

    +

    + // convert
    string according to the codepage and BIFF version

    + if($version
    == Spreadsheet_Excel_Reader_BIFF8)

    + $retstr =
    $this->_encodeUTF16($retstr, $asciiEncoding);

    + else

    + $retstr =
    $this->_decodeCodepage($retstr);

    +

    // echo "Str
    $i = $retstr\n";

    if ($richString){

    $spos +=
    4 * $formattingRuns;

    @@ -455,6 +463,130 @@

    'offset'=>$rec_offset);

    break;

    + case Spreadsheet_Excel_Reader_Type_CODEPAGE:

    + // echo "Type.CODEPAGE";

    + $codepage =
    $this->_GetInt2d($this->data, $pos+4);

    +

    + switch($codepage)

    + {

    + case 367: // ASCII

    + $this->_codepage = "ASCII";

    + break;

    + case 437: //OEM US

    + $this->_codepage = "CP437";

    + break;

    + case 720: //OEM Arabic

    + // currently not supported by
    libiconv

    + $this->_codepage = "";

    + break;

    + case 737: //OEM Greek

    + $this->_codepage = "CP737";

    + break;

    + case 775: //OEM Baltic

    + $this->_codepage = "CP775";

    + break;

    + case 850: //OEM Latin I

    + $this->_codepage = "CP850";

    + break;

    + case 852: //OEM Latin II (Central
    European)

    + $this->_codepage = "CP852";

    + break;

    + case 855: //OEM Cyrillic

    + $this->_codepage = "CP855";

    + break;

    + case 857: //OEM Turkish

    + $this->_codepage = "CP857";

    + break;

    + case 858: //OEM Multilingual Latin
    I with Euro

    + $this->_codepage = "CP858";

    + break;

    + case 860: //OEM Portuguese

    + $this->_codepage = "CP860";

    + break;

    + case 861: //OEM Icelandic

    + $this->_codepage = "CP861";

    + break;

    + case 862: //OEM Hebrew

    + $this->_codepage = "CP862";

    + break;

    + case 863: //OEM Canadian (French)

    + $this->_codepage = "CP863";

    + break;

    + case 864: //OEM Arabic

    + $this->_codepage = "CP864";

    + break;

    + case 865: //OEM Nordic

    + $this->_codepage = "CP865";

    + break;

    + case 866: //OEM Cyrillic (Russian)

    + $this->_codepage = "CP866";

    + break;

    + case 869: //OEM Greek (Modern)

    + $this->_codepage = "CP869";

    + break;

    + case 874: //ANSI Thai

    + $this->_codepage = "CP874";

    + break;

    + case 932: //ANSI Japanese Shift-JIS

    + $this->_codepage = "CP932";

    + break;

    + case 936: //ANSI Chinese
    Simplified GBK

    + $this->_codepage = "CP936";

    + break;

    + case 949: //ANSI Korean (Wansung)

    + $this->_codepage = "CP949";

    + break;

    + case 950: //ANSI Chinese
    Traditional BIG5

    + $this->_codepage = "CP950";

    + break;

    + case 1200: //UTF-16 (BIFF8)

    + $this->_codepage = "UTF-16LE";

    + break;

    + case 1250:// ANSI Latin II
    (Central European)

    + $this->_codepage = "CP1250";

    + break;

    + case 1251: //ANSI Cyrillic

    + $this->_codepage = "CP1251";

    + break;

    + case 1252: //ANSI Latin I
    (BIFF4-BIFF7)

    + $this->_codepage = "CP1252";

    + break;

    + case 1253: //ANSI Greek

    + $this->_codepage = "CP1253";

    + break;

    + case 1254: //ANSI Turkish

    + $this->_codepage = "CP1254";

    + break;

    + case 1255: //ANSI Hebrew

    + $this->_codepage = "CP1255";

    + break;

    + case 1256: //ANSI Arabic

    + $this->_codepage = "CP1256";

    + break;

    + case 1257: //ANSI Baltic

    + $this->_codepage = "CP1257";

    + break;

    + case 1258: //ANSI Vietnamese

    + $this->_codepage = "CP1258";

    + break;

    + case 1361: //ANSI Korean (Johab)

    + $this->_codepage = "CP1361";

    + break;

    + case 10000: //Apple Roman

    + // currently not supported by
    libiconv

    + $this->_codepage = "";

    + break;

    + case 32768: //Apple Roman

    + // currently not supported by
    libiconv

    + $this->_codepage = "";

    + break;

    + case 32769: //ANSI Latin I
    (BIFF2-BIFF3)

    + // currently not supported by
    libiconv

    + $this->_codepage = "";

    + break;

    + }

    + break;

    +

    }

    @@ -789,9 +921,12 @@
    return $value;

    }

    - function _encodeUTF16($string){

    + function _encodeUTF16($string, $compressed){

    $result = $string;

    if ($this->_defaultEncoding){

    + if($compressed){

    + $string =
    $this->_uncompressByteString($string);

    + }

    switch ($this->_encoderFunction){

    case 'iconv' : $result = iconv('UTF-16LE',
    $this->_defaultEncoding, $string);

    break;

    @@ -806,6 +941,30 @@
    return ord($data[$pos]) | (ord($data[$pos+1]) << 8)
    | (ord($data[$pos+2]) << 16) | (ord($data[$pos+3]) << 24);

    }

    + function _GetInt2d($data, $pos) {

    + return ord($data[$pos]) | (ord($data[$pos+1]) << 8);

    + }

    +

    + function _decodeCodepage($string){

    + $result = $string;

    + if ($this->_defaultEncoding && $this->_codepage){

    + switch ($this->_encoderFunction){

    + case 'iconv' : $result = iconv($this->_codepage,
    $this->_defaultEncoding, $string);

    + break;

    + case 'mb_convert_encoding' : $result =
    mb_convert_encoding($string, $this->_defaultEncoding,
    $this->_codepage );

    + break;

    + }

    + }

    + return $result;

    + }

    +

    + function _uncompressByteString($string){

    + $uncompressedString = "";

    + for($i = 0; $i < strlen($string); $i++){

    + $uncompressedString .= $string[$i]."\0";

    + }

    + return $uncompressedString;

    + }

    }

     
  • Pro-Hvost
    Pro-Hvost
    2005-10-31

    Logged In: YES
    user_id=1370830

    In this code:

    @@ -789,9 +921,12 @@
    return $value;

    }

    - function _encodeUTF16($string){
    + function _encodeUTF16($string, $compressed){
    $result = $string;
    if ($this->_defaultEncoding){
    + if($compressed){
    + $string = $this->_uncompressByteString
    ($string);
    + }
    switch ($this->_encoderFunction){
    case 'iconv' : $result = iconv
    ('UTF-16LE', $this->_defaultEncoding, $string);

    break;

    change

    + function _encodeUTF16($string, $compressed){

    to

    + function _encodeUTF16($string, $compressed = ""){

    This fixes the "Warning: Missing argument 2 for _encodeutf16() ..."
    message.

     
  • Logged In: YES
    user_id=1387986

    Please use the patch for Unicode sheetnames instead of
    changing "_encodeUTF16($string, $compressed)" to
    "_encodeUTF16($string, $compressed = "")".