[Faxpp-devel] SF.net SVN: faxpp: [10] trunk/faxpp
Status: Beta
Brought to you by:
jpcs
From: <jp...@us...> - 2007-08-08 14:02:48
|
Revision: 10 http://faxpp.svn.sourceforge.net/faxpp/?rev=10&view=rev Author: jpcs Date: 2007-08-08 07:02:32 -0700 (Wed, 08 Aug 2007) Log Message: ----------- Started work on the encoding selection framework. Moved the endianness check to be performed by the configure script. Modified Paths: -------------- trunk/faxpp/configure trunk/faxpp/configure.in trunk/faxpp/examples/tokenizer_example.c trunk/faxpp/include/faxpp/parser.h trunk/faxpp/include/faxpp/tokenizer.h trunk/faxpp/src/config.h.in trunk/faxpp/src/xml_parser.c trunk/faxpp/src/xml_parser.h trunk/faxpp/src/xml_tokenizer.c Modified: trunk/faxpp/configure =================================================================== --- trunk/faxpp/configure 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/configure 2007-08-08 14:02:32 UTC (rev 10) @@ -19826,6 +19826,242 @@ fi +{ echo "$as_me:$LINENO: checking whether byte ordering is bigendian" >&5 +echo $ECHO_N "checking whether byte ordering is bigendian... $ECHO_C" >&6; } +if test "${ac_cv_c_bigendian+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + # See if sys/param.h defines the BYTE_ORDER macro. +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include <sys/types.h> +#include <sys/param.h> + +int +main () +{ +#if ! (defined BYTE_ORDER && defined BIG_ENDIAN && defined LITTLE_ENDIAN \ + && BYTE_ORDER && BIG_ENDIAN && LITTLE_ENDIAN) + bogus endian macros +#endif + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + # It does; now see whether it defined to BIG_ENDIAN or not. +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include <sys/types.h> +#include <sys/param.h> + +int +main () +{ +#if BYTE_ORDER != BIG_ENDIAN + not big endian +#endif + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_c_bigendian=yes +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_cv_c_bigendian=no +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + # It does not; compile a test program. +if test "$cross_compiling" = yes; then + # try to guess the endianness by grepping values into an object file + ac_cv_c_bigendian=unknown + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. 
*/ +short int ascii_mm[] = { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 }; +short int ascii_ii[] = { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 }; +void _ascii () { char *s = (char *) ascii_mm; s = (char *) ascii_ii; } +short int ebcdic_ii[] = { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 }; +short int ebcdic_mm[] = { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 }; +void _ebcdic () { char *s = (char *) ebcdic_mm; s = (char *) ebcdic_ii; } +int +main () +{ + _ascii (); _ebcdic (); + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + if grep BIGenDianSyS conftest.$ac_objext >/dev/null ; then + ac_cv_c_bigendian=yes +fi +if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then + if test "$ac_cv_c_bigendian" = unknown; then + ac_cv_c_bigendian=no + else + # finding both strings is unlikely to happen, but who knows? + ac_cv_c_bigendian=unknown + fi +fi +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$ac_includes_default +int +main () +{ + + /* Are we little or big endian? From Harbison&Steele. 
*/ + union + { + long int l; + char c[sizeof (long int)]; + } u; + u.l = 1; + return u.c[sizeof (long int) - 1] == 1; + + ; + return 0; +} +_ACEOF +rm -f conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { ac_try='./conftest$ac_exeext' + { (case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + ac_cv_c_bigendian=no +else + echo "$as_me: program exited with status $ac_status" >&5 +echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +( exit $ac_status ) +ac_cv_c_bigendian=yes +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext +fi + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ echo "$as_me:$LINENO: result: $ac_cv_c_bigendian" >&5 +echo "${ECHO_T}$ac_cv_c_bigendian" >&6; } +case $ac_cv_c_bigendian in + yes) + +cat >>confdefs.h <<\_ACEOF +#define WORDS_BIGENDIAN 1 +_ACEOF + ;; + no) + ;; + *) + { { echo "$as_me:$LINENO: error: unknown endianness +presetting ac_cv_c_bigendian=no (or yes) will help" >&5 +echo "$as_me: error: unknown endianness +presetting ac_cv_c_bigendian=no (or yes) will help" >&2;} + { (exit 1); exit 1; }; } ;; +esac + { echo "$as_me:$LINENO: checking whether time.h and sys/time.h may both be included" >&5 echo $ECHO_N "checking whether time.h and sys/time.h may both be included... 
$ECHO_C" >&6; } if test "${ac_cv_header_time+set}" = set; then Modified: trunk/faxpp/configure.in =================================================================== --- trunk/faxpp/configure.in 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/configure.in 2007-08-08 14:02:32 UTC (rev 10) @@ -54,6 +54,7 @@ # Checks for typedefs, structures, and compiler characteristics. AC_C_CONST +AC_C_BIGENDIAN AC_HEADER_TIME AC_TYPE_UINT16_T AC_TYPE_UINT32_T Modified: trunk/faxpp/examples/tokenizer_example.c =================================================================== --- trunk/faxpp/examples/tokenizer_example.c 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/examples/tokenizer_example.c 2007-08-08 14:02:32 UTC (rev 10) @@ -82,7 +82,11 @@ if(err == PREMATURE_END_OF_BUFFER && length == sizeof(xml)) { // Repopulate the buffer void *buffer_position; - FAXPP_tokenizer_release_buffer(tokenizer, &buffer_position); + err = FAXPP_tokenizer_release_buffer(tokenizer, &buffer_position); + if(err != NO_ERROR) { + printf("ERROR: %s\n", FAXPP_err_to_string(err)); + exit(1); + } if(buffer_position < (void*)xml + sizeof(xml)) { length = (void*)(xml + sizeof(xml)) - buffer_position; @@ -92,7 +96,11 @@ length += fread(xml, 1, sizeof(xml) - length, file); - FAXPP_continue_tokenize(tokenizer, xml, length, length != sizeof(xml)); + err = FAXPP_continue_tokenize(tokenizer, xml, length, length != sizeof(xml)); + if(err != NO_ERROR) { + printf("ERROR: %s\n", FAXPP_err_to_string(err)); + exit(1); + } } else if(err != NO_ERROR) { printf("%03d:%03d ERROR: %s\n", FAXPP_get_tokenizer_error_line(tokenizer), Modified: trunk/faxpp/include/faxpp/parser.h =================================================================== --- trunk/faxpp/include/faxpp/parser.h 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/include/faxpp/parser.h 2007-08-08 14:02:32 UTC (rev 10) @@ -103,6 +103,22 @@ typedef unsigned int (*FAXPP_ReadCallback)(void *userData, void *buffer, unsigned int length); /** + * The function called 
when faxpp reads an encoding declaration in the XML document, or determines + * that the document does not contain an encoding declaration. The function should return a + * FAXPP_DecodeFunction for the encoding, or null if the encoding is not supported. + * + * \param userData The user data supplied to the FAXPP_set_encoding_callback() method + * \param encoding The encoding string found, or null if the document does not contain an encoding + * declaration. + * \param sniffedEncoding The encoding function already in use by the tokenizer, that was determined + * using auto-detection when document parsing began. + * + * \return The FAXPP_DecodeFunction to use to decode the document, or null if the encoding is not supported + */ +typedef FAXPP_DecodeFunction (*FAXPP_EncodingCallback)(void *userData, const FAXPP_Text *encoding, + FAXPP_DecodeFunction sniffedEncoding); + +/** * Creates a parser object * * \param mode The type of checks the parser should perform @@ -154,6 +170,19 @@ void FAXPP_set_encode(FAXPP_Parser *parser, FAXPP_EncodeFunction encode); /** + * Sets the encoding callback function that the parser will call when it reads an + * encoding declaration in the XML document, or determines that the document does + * not contain an encoding declaration. + * + * \param parser + * \param callback The encoding callback function to use, or null to use the default encoding callback + * \param userData The user data to be passed to the callback function when it is called + * + * \relatesalso FAXPP_Parser + */ +void FAXPP_set_encoding_callback(FAXPP_Parser *parser, FAXPP_EncodingCallback callback, void *userData); + +/** * Initialize the parser to parse the given buffer. This will halt any * parse that was already in progress. 
* @@ -164,6 +193,7 @@ * \param parser The parser to initialize * \param buffer A pointer to the start of the buffer to parse * \param length The length of the given buffer + * \param done Set to non-zero if this is the last buffer from the input * * \retval UNSUPPORTED_ENCODING If the encoding sniffing algorithm cannot recognize * the encoding of the buffer @@ -172,7 +202,7 @@ * * \relatesalso FAXPP_Parser */ -FAXPP_Error FAXPP_init_parse(FAXPP_Parser *parser, void *buffer, unsigned int length); +FAXPP_Error FAXPP_init_parse(FAXPP_Parser *parser, void *buffer, unsigned int length, unsigned int done); /** * Initialize the parser to parse the given file. This will halt any @@ -211,6 +241,43 @@ FAXPP_Error FAXPP_init_parse_callback(FAXPP_Parser *parser, FAXPP_ReadCallback callback, void *userData); /** + * Instructs the parser to release any dependencies it has on its current buffer. + * + * This is typically called on receiving a PREMATURE_END_OF_BUFFER error, before + * using FAXPP_continue_parse() to provide a new buffer. In this case, the buffer data + * between *buffer_position and the end of the buffer need to be copied into the start of + * the new buffer. + * + * \param parser + * \param[out] buffer_position Set to a pointer in the current buffer that the tokenizer + * has tokenized up to + * + * \retval OUT_OF_MEMORY + * \retval NO_ERROR + * + * \relatesalso FAXPP_Parser + */ +FAXPP_Error FAXPP_release_buffer(FAXPP_Parser *parser, void **buffer_position); + +/** + * Provides a new buffer for the parser to continue parsing. + * + * FAXPP_release_buffer() should have been called before this, + * and the remaining data in the old buffer transferred to the new one. 
+ * + * \param parser + * \param buffer A pointer to the start of the buffer to parse + * \param length The length of the given buffer + * \param done Set to non-zero if this is the last buffer from the input + * + * \retval NO_ERROR + * + * \relatesalso FAXPP_Parser + */ +FAXPP_Error FAXPP_continue_parse(FAXPP_Parser *parser, void *buffer, + unsigned int length, unsigned int done); + +/** * Parses the next event, placing the information for it * into the current event. * Modified: trunk/faxpp/include/faxpp/tokenizer.h =================================================================== --- trunk/faxpp/include/faxpp/tokenizer.h 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/include/faxpp/tokenizer.h 2007-08-08 14:02:32 UTC (rev 10) @@ -39,6 +39,7 @@ * \relatesalso FAXPP_Tokenizer */ FAXPP_Tokenizer *FAXPP_create_tokenizer(); + /** * Frees a tokenizer object * @@ -49,6 +50,28 @@ void FAXPP_free_tokenizer(FAXPP_Tokenizer *tokenizer); /** + * Returns the current FAXPP_DecodeFunction that the tokenizer is using. + * + * \param tokenizer + * \return The decode function + * + * \relatesalso FAXPP_Tokenizer + */ +FAXPP_DecodeFunction FAXPP_get_tokenizer_decode(const FAXPP_Tokenizer *tokenizer); + +/** + * Sets the FAXPP_DecodeFunction that the tokenizer uses to decode the XML document. + * This will typically be called when an encoding declaration is read, to switch to + * the correct decode function. + * + * \param tokenizer + * \param decode The decode function + * + * \relatesalso FAXPP_Tokenizer + */ +void FAXPP_set_tokenizer_decode(FAXPP_Tokenizer *tokenizer, FAXPP_DecodeFunction decode); + +/** * Initialize the tokenizer to tokenize the given buffer, returning strings * encoded using the given encoding function. * @@ -93,7 +116,7 @@ * FAXPP_tokenizer_release_buffer() should have been called before this, * and the remaining data in the old buffer transferred to the new one. 
* - * \param tokenizer The tokenizer to initialize + * \param tokenizer * \param buffer A pointer to the start of the buffer to tokenize * \param length The length of the given buffer * \param done Set to non-zero if this is the last buffer from the input Modified: trunk/faxpp/src/config.h.in =================================================================== --- trunk/faxpp/src/config.h.in 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/src/config.h.in 2007-08-08 14:02:32 UTC (rev 10) @@ -3,12 +3,6 @@ /* Define to 1 if you have the <dlfcn.h> header file. */ #undef HAVE_DLFCN_H -/* Define to 1 if you have the <fcntl.h> header file. */ -#undef HAVE_FCNTL_H - -/* Define to 1 if you have the `getpagesize' function. */ -#undef HAVE_GETPAGESIZE - /* Define to 1 if you have the `gettimeofday' function. */ #undef HAVE_GETTIMEOFDAY @@ -28,9 +22,6 @@ /* Define to 1 if you have the `memset' function. */ #undef HAVE_MEMSET -/* Define to 1 if you have a working `mmap' system call. */ -#undef HAVE_MMAP - /* Define to 1 if your system has a GNU libc compatible `realloc' function, and to 0 otherwise. */ #undef HAVE_REALLOC @@ -83,6 +74,10 @@ /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ #undef TIME_WITH_SYS_TIME +/* Define to 1 if your processor stores words with the most significant byte + first (like Motorola and SPARC, unlike Intel and VAX). */ +#undef WORDS_BIGENDIAN + /* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>, <pthread.h>, or <semaphore.h> is not used. If the typedef was allowed, the #define below would cause a syntax error. 
*/ Modified: trunk/faxpp/src/xml_parser.c =================================================================== --- trunk/faxpp/src/xml_parser.c 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/src/xml_parser.c 2007-08-08 14:02:32 UTC (rev 10) @@ -40,6 +40,14 @@ static FAXPP_Error wf_start_document_next_event(FAXPP_ParserEnv *env); static FAXPP_Error wf_next_event(FAXPP_ParserEnv *env); +static FAXPP_DecodeFunction p_default_encoding_callback(void *userData, const FAXPP_Text *encoding, + FAXPP_DecodeFunction sniffedEncoding) +{ + // TBD implement this - jpcs + // TBD eliminate the callback functions entirely - jpcs + return sniffedEncoding; +} + FAXPP_Parser *FAXPP_create_parser(FAXPP_ParseMode mode, FAXPP_EncodeFunction encode) { FAXPP_ParserEnv *env = malloc(sizeof(FAXPP_ParserEnv)); @@ -48,6 +56,8 @@ env->mode = mode; env->encode = encode; + env->encoding = p_default_encoding_callback; + /* The next_event field is set in p_reset_parser() */ env->max_attr_count = INITIAL_ATTRS_SIZE; @@ -122,6 +132,29 @@ free(env); } +void FAXPP_set_null_terminate(FAXPP_Parser *parser, unsigned int boolean) +{ + parser->null_terminate = boolean != 0; +} + +void FAXPP_set_encode(FAXPP_Parser *parser, FAXPP_EncodeFunction encode) +{ + parser->encode = encode; +} + + +void FAXPP_set_encoding_callback(FAXPP_Parser *parser, FAXPP_EncodingCallback callback, void *userData) +{ + if(callback == 0) { + parser->encoding = p_default_encoding_callback; + parser->encoding_user_data = 0; + } + else { + parser->encoding = callback; + parser->encoding_user_data = userData; + } +} + static FAXPP_Error p_reset_parser(FAXPP_ParserEnv *env, int allocate_buffer) { // Reset the stack buffer cursor @@ -147,7 +180,7 @@ return NO_ERROR; } -FAXPP_Error FAXPP_init_parse(FAXPP_Parser *env, void *buffer, unsigned int length) +FAXPP_Error FAXPP_init_parse(FAXPP_Parser *env, void *buffer, unsigned int length, unsigned int done) { FAXPP_Error err = p_reset_parser(env, /*allocate_buffer*/0); if(err != 0) return err; 
@@ -155,7 +188,7 @@ env->read = 0; env->read_user_data = 0; - return FAXPP_init_tokenize(&env->tenv, buffer, length, /*done*/1, env->encode); + return FAXPP_init_tokenize(&env->tenv, buffer, length, done, env->encode); } static unsigned int p_file_read_callback(void *userData, void *buffer, unsigned int length) @@ -178,7 +211,6 @@ unsigned int len = env->read(env->read_user_data, env->read_buffer, env->read_buffer_length); - // TBD boolean for indicating this is the last buffer - jpcs return FAXPP_init_tokenize(&env->tenv, env->read_buffer, len, /*done*/len != env->read_buffer_length, env->encode); } @@ -291,14 +323,14 @@ } \ } -static FAXPP_Error p_read_more(FAXPP_ParserEnv *env) +FAXPP_Error FAXPP_release_buffer(FAXPP_Parser *env, void **buffer_position) { - unsigned int len = 0; - unsigned int readlen; unsigned int i; FAXPP_AttrValue *atval; + FAXPP_Error err; - FAXPP_tokenizer_release_buffer(&env->tenv, 0); + err = FAXPP_tokenizer_release_buffer(&env->tenv, buffer_position); + if(err != 0) return err; // Copy any strings in the event which point to the old buffer // into the event_buffer @@ -322,6 +354,24 @@ } } + return NO_ERROR; +} + +FAXPP_Error FAXPP_continue_parse(FAXPP_Parser *env, void *buffer, + unsigned int length, unsigned int done) +{ + return FAXPP_continue_tokenize(&env->tenv, buffer, length, done); +} + +static FAXPP_Error p_read_more(FAXPP_ParserEnv *env) +{ + unsigned int len = 0; + unsigned int readlen; + FAXPP_Error err; + + err = FAXPP_release_buffer(env, 0); + if(err != 0) return err; + if(env->tenv.position < env->tenv.buffer_end) { // We're half way through a charcter, so we need to copy // the partial char to the begining of the buffer to keep @@ -335,7 +385,7 @@ return PREMATURE_END_OF_BUFFER; len += readlen; - return FAXPP_continue_tokenize(&env->tenv, env->read_buffer, len, /*done*/len != env->read_buffer_length); + return FAXPP_continue_parse(env, env->read_buffer, len, /*done*/len != env->read_buffer_length); } #define 
p_check_err(err, env) \ @@ -601,6 +651,7 @@ static FAXPP_Error nc_start_document_next_event(FAXPP_ParserEnv *env) { FAXPP_Error err = 0; + FAXPP_DecodeFunction decode; p_reset_event(env); @@ -613,15 +664,20 @@ p_set_location_from_token(env); break; case XML_DECL_ENCODING_TOKEN: - // TBD invoke a callback function to change the transcoder p_copy_text_from_token(&env->event.encoding, env, /*useTokenBuffer*/0); break; case XML_DECL_STANDALONE_TOKEN: p_copy_text_from_token(&env->event.standalone, env, /*useTokenBuffer*/0); break; default: + env->buffered_token = 1; + + // Invoke the callback function to change the decoder + decode = env->encoding(env->encoding_user_data, &env->event.encoding, env->tenv.decode); + if(decode == 0) return UNSUPPORTED_ENCODING; + FAXPP_set_tokenizer_decode(&env->tenv, decode); + env->next_event = nc_next_event; - env->buffered_token = 1; env->event.type = START_DOCUMENT_EVENT; return NO_ERROR; } @@ -633,6 +689,8 @@ static FAXPP_Error nc_next_event(FAXPP_ParserEnv *env) { + // TBD keep all state in the FAXPP_ParserEnv to allow progressive parse to work correctly - jpcs + FAXPP_Error err = 0; p_reset_event(env); Modified: trunk/faxpp/src/xml_parser.h =================================================================== --- trunk/faxpp/src/xml_parser.h 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/src/xml_parser.h 2007-08-08 14:02:32 UTC (rev 10) @@ -55,6 +55,9 @@ FAXPP_ParseMode mode; FAXPP_EncodeFunction encode; + FAXPP_EncodingCallback encoding; + void *encoding_user_data; + FAXPP_ReadCallback read; void *read_user_data; Modified: trunk/faxpp/src/xml_tokenizer.c =================================================================== --- trunk/faxpp/src/xml_tokenizer.c 2007-08-07 12:16:40 UTC (rev 9) +++ trunk/faxpp/src/xml_tokenizer.c 2007-08-08 14:02:32 UTC (rev 10) @@ -19,6 +19,7 @@ #include "xml_tokenizer.h" #include "tokenizer_states.h" +#include "config.h" #include <faxpp/token.h> /********************* @@ -29,20 +30,11 @@ #define 
INITIAL_TOKEN_BUFFER_SIZE 64 -static unsigned int native_little_endian() -{ - // A test to see if the machine is natively little endian - // TBD Use configure to figure this out? - jpcs - uint32_t num = 0x00000001; - uint8_t *ptr = (uint8_t*)&num; - return (unsigned int)*ptr; -} - FAXPP_Error sniff_encoding(FAXPP_TokenizerEnv *env) { // Default encoding is UTF-8 - env->decode = FAXPP_utf8_decode; + FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode); // Make initial judgement on the encoding unsigned char *buf = (unsigned char*)env->position; @@ -59,8 +51,11 @@ switch(*buf) { case 0x3C: /* 00 00 00 3C UCS-4, big-endian machine (1234 order) */ - if(native_little_endian()) env->decode = FAXPP_ucs4_be_decode; - else env->decode = FAXPP_ucs4_native_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_be_decode); +#endif break; } break; @@ -75,8 +70,11 @@ switch(*buf) { case 0xFF: /* 00 00 FE FF UCS-4, big-endian machine (1234 order) */ - if(native_little_endian()) env->decode = FAXPP_ucs4_be_decode; - else env->decode = FAXPP_ucs4_native_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_be_decode); +#endif // Skip BOM env->position += 4; break; @@ -100,14 +98,11 @@ return UNSUPPORTED_ENCODING; case 0x3F: /* 00 3C 00 3F UTF-16, big-endian */ - if(native_little_endian()) env->decode = FAXPP_utf16_be_decode; - else { - env->decode = FAXPP_utf16_native_decode; - env->start_element_name_state = utf16_start_element_name_state; - env->element_content_state = utf16_element_content_state; - if(env->encode == FAXPP_utf16_native_encode) - env->encode = 0; - } +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_be_decode); +#endif break; } break; @@ -123,8 +118,11 @@ switch(*buf) { case 0x00: /* 3C 00 00 00 UCS-4, 
little-endian machine (4321 order) */ - if(native_little_endian()) env->decode = FAXPP_ucs4_native_decode; - else env->decode = FAXPP_ucs4_le_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_le_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode); +#endif break; } break; @@ -132,14 +130,11 @@ switch(*buf) { case 0x00: /* 3C 00 3F 00 UTF-16, little-endian */ - if(native_little_endian()) { - env->decode = FAXPP_utf16_native_decode; - env->start_element_name_state = utf16_start_element_name_state; - env->element_content_state = utf16_element_content_state; - if(env->encode == FAXPP_utf16_native_encode) - env->encode = 0; - } - else env->decode = FAXPP_utf16_le_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_le_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode); +#endif break; } break; @@ -151,9 +146,7 @@ switch(*buf) { case 0x6D: /* 3C 3F 78 6D UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, etc. 
*/ - env->decode = FAXPP_utf8_decode; - if(env->encode == FAXPP_utf8_encode) - env->encode = 0; + FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode); break; } break; @@ -182,9 +175,7 @@ switch(*buf++) { case 0xBF: /* EF BB BF UTF-8 with byte order mark */ - env->decode = FAXPP_utf8_decode; - if(env->encode == FAXPP_utf8_encode) - env->encode = 0; + FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode); // Skip BOM env->position += 3; } @@ -202,14 +193,11 @@ return UNSUPPORTED_ENCODING; default: /* FE FF ## ## UTF-16, big-endian */ - if(native_little_endian()) env->decode = FAXPP_utf16_be_decode; - else { - env->decode = FAXPP_utf16_native_decode; - env->start_element_name_state = utf16_start_element_name_state; - env->element_content_state = utf16_element_content_state; - if(env->encode == FAXPP_utf16_native_encode) - env->encode = 0; - } +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_be_decode); +#endif // Skip BOM env->position += 2; break; @@ -217,14 +205,11 @@ break; default: /* FE FF ## ## UTF-16, big-endian */ - if(native_little_endian()) env->decode = FAXPP_utf16_be_decode; - else { - env->decode = FAXPP_utf16_native_decode; - env->start_element_name_state = utf16_start_element_name_state; - env->element_content_state = utf16_element_content_state; - if(env->encode == FAXPP_utf16_native_encode) - env->encode = 0; - } +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_be_decode); +#endif // Skip BOM env->position += 2; break; @@ -240,21 +225,21 @@ switch(*buf) { case 0x00: /* FF FE 00 00 UCS-4, little-endian machine (4321 order) */ - if(native_little_endian()) env->decode = FAXPP_ucs4_native_decode; - else env->decode = FAXPP_ucs4_le_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_le_decode); +#else + FAXPP_set_tokenizer_decode(env, 
FAXPP_ucs4_native_decode); +#endif // Skip BOM env->position += 4; break; default: /* FF FE ## ## UTF-16, little-endian */ - if(native_little_endian()) { - env->decode = FAXPP_utf16_native_decode; - env->start_element_name_state = utf16_start_element_name_state; - env->element_content_state = utf16_element_content_state; - if(env->encode == FAXPP_utf16_native_encode) - env->encode = 0; - } - else env->decode = FAXPP_utf16_le_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_le_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode); +#endif // Skip BOM env->position += 2; break; @@ -262,14 +247,11 @@ break; default: /* FF FE ## ## UTF-16, little-endian */ - if(native_little_endian()) { - env->decode = FAXPP_utf16_native_decode; - env->start_element_name_state = utf16_start_element_name_state; - env->element_content_state = utf16_element_content_state; - if(env->encode == FAXPP_utf16_native_encode) - env->encode = 0; - } - else env->decode = FAXPP_utf16_le_decode; +#ifdef WORDS_BIGENDIAN + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_le_decode); +#else + FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode); +#endif // Skip BOM env->position += 2; break; @@ -279,14 +261,48 @@ break; } - if(env->decode == FAXPP_utf8_decode) { - if(env->encode == FAXPP_utf8_encode) - env->encode = 0; - env->start_element_name_state = utf8_start_element_name_state; - env->element_content_state = utf8_element_content_state; + return NO_ERROR; +} + +FAXPP_DecodeFunction +FAXPP_get_tokenizer_decode(const FAXPP_Tokenizer *tokenizer) +{ + return tokenizer->decode; +} + +void +FAXPP_set_tokenizer_decode(FAXPP_Tokenizer *tokenizer, FAXPP_DecodeFunction decode) +{ + if(decode == FAXPP_utf16_native_decode || +#ifdef WORDS_BIGENDIAN + decode == FAXPP_utf16_be_decode +#else + decode == FAXPP_utf16_le_decode +#endif + ) { + tokenizer->decode = FAXPP_utf16_native_decode; + + if(tokenizer->encode == FAXPP_utf16_native_encode) + tokenizer->encode = 
0; + + tokenizer->start_element_name_state = utf16_start_element_name_state; + tokenizer->element_content_state = utf16_element_content_state; } + else if(decode == FAXPP_utf8_decode) { + tokenizer->decode = FAXPP_utf8_decode; - return NO_ERROR; + if(tokenizer->encode == FAXPP_utf8_encode) + tokenizer->encode = 0; + + tokenizer->start_element_name_state = utf8_start_element_name_state; + tokenizer->element_content_state = utf8_element_content_state; + } + else { + tokenizer->decode = decode; + + tokenizer->start_element_name_state = default_start_element_name_state; + tokenizer->element_content_state = default_element_content_state; + } } FAXPP_Error This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |