jaenichen@globalpark.de:
Compressed UTF-16 strings in BIFF8 are not converted to
the default encoding because PHP-ExcelReader treats
them like ASCII strings.
This patch adds parsing of the CODEPAGE record and expansion of
compressed UTF-16 strings to PHP-ExcelReader.
The patchfile is attached to this message.
Logged In: NO
jaenichen@globalpark.de:
Somehow I couldn't upload the patch file, so here it is:
--- Excel/reader.php 2004-10-04 19:54:44.000000000 +0200
+++ reader.php 2005-07-05 15:03:31.000000000 +0200
@@ -42,6 +42,7 @@
define('Spreadsheet_Excel_Reader_Type_UNKNOWN', 0xffff);
define('Spreadsheet_Excel_Reader_Type_NINETEENFOUR', 0x22);
define('Spreadsheet_Excel_Reader_Type_MERGEDCELLS', 0xE5);
+define('Spreadsheet_Excel_Reader_Type_CODEPAGE', 0x42);
define('Spreadsheet_Excel_Reader_utcOffsetDays' , 25569);
define('Spreadsheet_Excel_Reader_utcOffsetDays1904', 24107);
@@ -79,6 +80,7 @@
var $pos;
var $_ole;
var $_defaultEncoding;
+ var $_codepage;
var $_defaultFormat =
Spreadsheet_Excel_Reader_DEF_NUM_FORMAT;
var $_columnsFormat = array();
var $_rowoffset = 1;
@@ -333,7 +335,13 @@
}
}
- $retstr =
($asciiEncoding) ? $retstr : $this->_encodeUTF16($retstr);
+
+ // convert
string according to the codepage and BIFF version
+ if($version
== Spreadsheet_Excel_Reader_BIFF8)
+ $retstr =
$this->_encodeUTF16($retstr, $asciiEncoding);
+ else
+ $retstr =
$this->_decodeCodepage($retstr);
+
// echo "Str
$i = $retstr\n";
if ($richString){
$spos +=
4 * $formattingRuns;
@@ -455,6 +463,130 @@
'offset'=>$rec_offset);
break;
+ case Spreadsheet_Excel_Reader_Type_CODEPAGE:
+ // echo "Type.CODEPAGE";
+ $codepage =
$this->_GetInt2d($this->data, $pos+4);
+
+ switch($codepage)
+ {
+ case 367: // ASCII
+ $this->_codepage = "ASCII";
+ break;
+ case 437: //OEM US
+ $this->_codepage = "CP437";
+ break;
+ case 720: //OEM Arabic
+ // currently not supported by
libiconv
+ $this->_codepage = "";
+ break;
+ case 737: //OEM Greek
+ $this->_codepage = "CP737";
+ break;
+ case 775: //OEM Baltic
+ $this->_codepage = "CP775";
+ break;
+ case 850: //OEM Latin I
+ $this->_codepage = "CP850";
+ break;
+ case 852: //OEM Latin II (Central
European)
+ $this->_codepage = "CP852";
+ break;
+ case 855: //OEM Cyrillic
+ $this->_codepage = "CP855";
+ break;
+ case 857: //OEM Turkish
+ $this->_codepage = "CP857";
+ break;
+ case 858: //OEM Multilingual Latin
I with Euro
+ $this->_codepage = "CP858";
+ break;
+ case 860: //OEM Portuguese
+ $this->_codepage = "CP860";
+ break;
+ case 861: //OEM Icelandic
+ $this->_codepage = "CP861";
+ break;
+ case 862: //OEM Hebrew
+ $this->_codepage = "CP862";
+ break;
+ case 863: //OEM Canadian (French)
+ $this->_codepage = "CP863";
+ break;
+ case 864: //OEM Arabic
+ $this->_codepage = "CP864";
+ break;
+ case 865: //OEM Nordic
+ $this->_codepage = "CP865";
+ break;
+ case 866: //OEM Cyrillic (Russian)
+ $this->_codepage = "CP866";
+ break;
+ case 869: //OEM Greek (Modern)
+ $this->_codepage = "CP869";
+ break;
+ case 874: //ANSI Thai
+ $this->_codepage = "CP874";
+ break;
+ case 932: //ANSI Japanese Shift-JIS
+ $this->_codepage = "CP932";
+ break;
+ case 936: //ANSI Chinese
Simplified GBK
+ $this->_codepage = "CP936";
+ break;
+ case 949: //ANSI Korean (Wansung)
+ $this->_codepage = "CP949";
+ break;
+ case 950: //ANSI Chinese
Traditional BIG5
+ $this->_codepage = "CP950";
+ break;
+ case 1200: //UTF-16 (BIFF8)
+ $this->_codepage = "UTF-16LE";
+ break;
+ case 1250:// ANSI Latin II
(Central European)
+ $this->_codepage = "CP1250";
+ break;
+ case 1251: //ANSI Cyrillic
+ $this->_codepage = "CP1251";
+ break;
+ case 1252: //ANSI Latin I
(BIFF4-BIFF7)
+ $this->_codepage = "CP1252";
+ break;
+ case 1253: //ANSI Greek
+ $this->_codepage = "CP1253";
+ break;
+ case 1254: //ANSI Turkish
+ $this->_codepage = "CP1254";
+ break;
+ case 1255: //ANSI Hebrew
+ $this->_codepage = "CP1255";
+ break;
+ case 1256: //ANSI Arabic
+ $this->_codepage = "CP1256";
+ break;
+ case 1257: //ANSI Baltic
+ $this->_codepage = "CP1257";
+ break;
+ case 1258: //ANSI Vietnamese
+ $this->_codepage = "CP1258";
+ break;
+ case 1361: //ANSI Korean (Johab)
+ $this->_codepage = "CP1361";
+ break;
+ case 10000: //Apple Roman
+ // currently not supported by
libiconv
+ $this->_codepage = "";
+ break;
+ case 32768: //Apple Roman
+ // currently not supported by
libiconv
+ $this->_codepage = "";
+ break;
+ case 32769: //ANSI Latin I
(BIFF2-BIFF3)
+ // currently not supported by
libiconv
+ $this->_codepage = "";
+ break;
+ }
+ break;
+
}
@@ -789,9 +921,12 @@
return $value;
}
- function _encodeUTF16($string){
+ function _encodeUTF16($string, $compressed){
$result = $string;
if ($this->_defaultEncoding){
+ if($compressed){
+ $string =
$this->_uncompressByteString($string);
+ }
switch ($this->_encoderFunction){
case 'iconv' : $result = iconv('UTF-16LE',
$this->_defaultEncoding, $string);
break;
@@ -806,6 +941,30 @@
return ord($data[$pos]) | (ord($data[$pos+1]) << 8)
| (ord($data[$pos+2]) << 16) | (ord($data[$pos+3]) << 24);
}
+ function _GetInt2d($data, $pos) {
+ return ord($data[$pos]) | (ord($data[$pos+1]) << 8);
+ }
+
+ function _decodeCodepage($string){
+ $result = $string;
+ if ($this->_defaultEncoding && $this->_codepage){
+ switch ($this->_encoderFunction){
+ case 'iconv' : $result = iconv($this->_codepage,
$this->_defaultEncoding, $string);
+ break;
+ case 'mb_convert_encoding' : $result =
mb_convert_encoding($string, $this->_defaultEncoding,
$this->_codepage );
+ break;
+ }
+ }
+ return $result;
+ }
+
+ function _uncompressByteString($string){
+ $uncompressedString = "";
+ for($i = 0; $i < strlen($string); $i++){
+ $uncompressedString .= $string[$i]."\0";
+ }
+ return $uncompressedString;
+ }
}
Logged In: YES
user_id=1370830
In this code:
@@ -789,9 +921,12 @@
return $value;
}
- function _encodeUTF16($string){
+ function _encodeUTF16($string, $compressed){
$result = $string;
if ($this->_defaultEncoding){
+ if($compressed){
+ $string = $this->_uncompressByteString
($string);
+ }
switch ($this->_encoderFunction){
case 'iconv' : $result = iconv
('UTF-16LE', $this->_defaultEncoding, $string);
break;
change
+ function _encodeUTF16($string, $compressed){
to
+ function _encodeUTF16($string, $compressed = ""){
This fixes the "Warning: Missing argument 2 for _encodeutf16() ..."
message.
Logged In: YES
user_id=1387986
Please use the patch for Unicode sheetnames instead of
changing "_encodeUTF16($string, $compressed)" to
"_encodeUTF16($string, $compressed = "")".