[Faxpp-devel] SF.net SVN: faxpp: [10] trunk/faxpp

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 10
          http://faxpp.svn.sourceforge.net/faxpp/?rev=10&view=rev
Author:   jpcs
Date:     2007-08-08 07:02:32 -0700 (Wed, 08 Aug 2007)

Log Message:
-----------
Started work on the encoding selection framework.

Moved the endianness check to be performed by the configure script.

Modified Paths:
--------------
    trunk/faxpp/configure
    trunk/faxpp/configure.in
    trunk/faxpp/examples/tokenizer_example.c
    trunk/faxpp/include/faxpp/parser.h
    trunk/faxpp/include/faxpp/tokenizer.h
    trunk/faxpp/src/config.h.in
    trunk/faxpp/src/xml_parser.c
    trunk/faxpp/src/xml_parser.h
    trunk/faxpp/src/xml_tokenizer.c

Modified: trunk/faxpp/configure
===================================================================

--- trunk/faxpp/configure	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/configure	2007-08-08 14:02:32 UTC (rev 10)
@@ -19826,6 +19826,242 @@
 
 fi
 
+{ echo "$as_me:$LINENO: checking whether byte ordering is bigendian" >&5
+echo $ECHO_N "checking whether byte ordering is bigendian... $ECHO_C" >&6; }
+if test "${ac_cv_c_bigendian+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  # See if sys/param.h defines the BYTE_ORDER macro.
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <sys/types.h>
+#include <sys/param.h>
+
+int
+main ()
+{
+#if  ! (defined BYTE_ORDER && defined BIG_ENDIAN && defined LITTLE_ENDIAN \
+	&& BYTE_ORDER && BIG_ENDIAN && LITTLE_ENDIAN)
+ bogus endian macros
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  # It does; now see whether it defined to BIG_ENDIAN or not.
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <sys/types.h>
+#include <sys/param.h>
+
+int
+main ()
+{
+#if BYTE_ORDER != BIG_ENDIAN
+ not big endian
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_cv_c_bigendian=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_c_bigendian=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	# It does not; compile a test program.
+if test "$cross_compiling" = yes; then
+  # try to guess the endianness by grepping values into an object file
+  ac_cv_c_bigendian=unknown
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+short int ascii_mm[] = { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 };
+short int ascii_ii[] = { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 };
+void _ascii () { char *s = (char *) ascii_mm; s = (char *) ascii_ii; }
+short int ebcdic_ii[] = { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 };
+short int ebcdic_mm[] = { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 };
+void _ebcdic () { char *s = (char *) ebcdic_mm; s = (char *) ebcdic_ii; }
+int
+main ()
+{
+ _ascii (); _ebcdic ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  if grep BIGenDianSyS conftest.$ac_objext >/dev/null ; then
+  ac_cv_c_bigendian=yes
+fi
+if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then
+  if test "$ac_cv_c_bigendian" = unknown; then
+    ac_cv_c_bigendian=no
+  else
+    # finding both strings is unlikely to happen, but who knows?
+    ac_cv_c_bigendian=unknown
+  fi
+fi
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+
+  /* Are we little or big endian?  From Harbison&Steele.  */
+  union
+  {
+    long int l;
+    char c[sizeof (long int)];
+  } u;
+  u.l = 1;
+  return u.c[sizeof (long int) - 1] == 1;
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_c_bigendian=no
+else
+  echo "$as_me: program exited with status $ac_status" >&5
+echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+ac_cv_c_bigendian=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_c_bigendian" >&5
+echo "${ECHO_T}$ac_cv_c_bigendian" >&6; }
+case $ac_cv_c_bigendian in
+  yes)
+
+cat >>confdefs.h <<\_ACEOF
+#define WORDS_BIGENDIAN 1
+_ACEOF
+ ;;
+  no)
+     ;;
+  *)
+    { { echo "$as_me:$LINENO: error: unknown endianness
+presetting ac_cv_c_bigendian=no (or yes) will help" >&5
+echo "$as_me: error: unknown endianness
+presetting ac_cv_c_bigendian=no (or yes) will help" >&2;}
+   { (exit 1); exit 1; }; } ;;
+esac
+
 { echo "$as_me:$LINENO: checking whether time.h and sys/time.h may both be included" >&5
 echo $ECHO_N "checking whether time.h and sys/time.h may both be included... $ECHO_C" >&6; }
 if test "${ac_cv_header_time+set}" = set; then

Modified: trunk/faxpp/configure.in
===================================================================
--- trunk/faxpp/configure.in	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/configure.in	2007-08-08 14:02:32 UTC (rev 10)
@@ -54,6 +54,7 @@
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_CONST
+AC_C_BIGENDIAN
 AC_HEADER_TIME
 AC_TYPE_UINT16_T
 AC_TYPE_UINT32_T

Modified: trunk/faxpp/examples/tokenizer_example.c
===================================================================
--- trunk/faxpp/examples/tokenizer_example.c	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/examples/tokenizer_example.c	2007-08-08 14:02:32 UTC (rev 10)
@@ -82,7 +82,11 @@
       if(err == PREMATURE_END_OF_BUFFER && length == sizeof(xml)) {
         // Repopulate the buffer
         void *buffer_position;
-        FAXPP_tokenizer_release_buffer(tokenizer, &buffer_position);
+        err = FAXPP_tokenizer_release_buffer(tokenizer, &buffer_position);
+        if(err != NO_ERROR) {
+          printf("ERROR: %s\n", FAXPP_err_to_string(err));
+          exit(1);
+        }
 
         if(buffer_position < (void*)xml + sizeof(xml)) {
           length = (void*)(xml + sizeof(xml)) - buffer_position;
@@ -92,7 +96,11 @@
 
         length += fread(xml, 1, sizeof(xml) - length, file);
 
-        FAXPP_continue_tokenize(tokenizer, xml, length, length != sizeof(xml));
+        err = FAXPP_continue_tokenize(tokenizer, xml, length, length != sizeof(xml));
+        if(err != NO_ERROR) {
+          printf("ERROR: %s\n", FAXPP_err_to_string(err));
+          exit(1);
+        }
       }
       else if(err != NO_ERROR) {
         printf("%03d:%03d ERROR: %s\n", FAXPP_get_tokenizer_error_line(tokenizer),

Modified: trunk/faxpp/include/faxpp/parser.h
===================================================================
--- trunk/faxpp/include/faxpp/parser.h	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/include/faxpp/parser.h	2007-08-08 14:02:32 UTC (rev 10)
@@ -103,6 +103,22 @@
 typedef unsigned int (*FAXPP_ReadCallback)(void *userData, void *buffer, unsigned int length);
 
 /**
+ * The function called when faxpp reads an encoding declaration in the XML document, or determines
+ * that the document does not contain an encoding declaration. The function should return a
+ * FAXPP_DecodeFunction for the encoding, or null if the encoding is not supported.
+ *
+ * \param userData The user data supplied to the FAXPP_set_encoding_callback() method
+ * \param encoding The encoding string found, or null if the document does not contain an encoding
+ * declaration.
+ * \param sniffedEncoding The encoding function already in use by the tokenizer, that was determined
+ * using auto-detection when document parsing began.
+ *
+ * \return The FAXPP_DecodeFunction to use to decode the document, or null if the encoding is not supported
+ */
+typedef FAXPP_DecodeFunction (*FAXPP_EncodingCallback)(void *userData, const FAXPP_Text *encoding,
+                                                       FAXPP_DecodeFunction sniffedEncoding);
+
+/**
  * Creates a parser object
  *
  * \param mode The type of checks the parser should perform
@@ -154,6 +170,19 @@
 void FAXPP_set_encode(FAXPP_Parser *parser, FAXPP_EncodeFunction encode);
 
 /**
+ * Sets the encoding callback function that the parser will call when it reads an
+ * encoding declaration in the XML document, or determines that the document does
+ * not contain an encoding declaration.
+ *
+ * \param parser
+ * \param callback The encoding callback function to use, or null to use the default encoding callback
+ * \param userData The user data to be passed to the callback function when it is called
+ *
+ * \relatesalso FAXPP_Parser
+ */
+void FAXPP_set_encoding_callback(FAXPP_Parser *parser, FAXPP_EncodingCallback callback, void *userData);
+
+/**
  * Initialize the parser to parse the given buffer. This will halt any
  * parse that was already in progress.
  *
@@ -164,6 +193,7 @@
  * \param parser The parser to initialize
  * \param buffer A pointer to the start of the buffer to parse
  * \param length The length of the given buffer
+ * \param done Set to non-zero if this is the last buffer from the input
  *
  * \retval UNSUPPORTED_ENCODING If the encoding sniffing algorithm cannot recognize
  * the encoding of the buffer
@@ -172,7 +202,7 @@
  *
  * \relatesalso FAXPP_Parser
  */
-FAXPP_Error FAXPP_init_parse(FAXPP_Parser *parser, void *buffer, unsigned int length);
+FAXPP_Error FAXPP_init_parse(FAXPP_Parser *parser, void *buffer, unsigned int length, unsigned int done);
 
 /**
  * Initialize the parser to parse the given file. This will halt any
@@ -211,6 +241,43 @@
 FAXPP_Error FAXPP_init_parse_callback(FAXPP_Parser *parser, FAXPP_ReadCallback callback, void *userData);
 
 /**
+ * Instructs the parser to release any dependencies it has on its current buffer.
+ *
+ * This is typically called on receiving a PREMATURE_END_OF_BUFFER error, before
+ * using FAXPP_continue_parse() to provide a new buffer. In this case, the buffer data
+ * between *buffer_position and the end of the buffer need to be copied into the start of
+ * the new buffer.
+ *
+ * \param parser
+ * \param[out] buffer_position Set to a pointer in the current buffer that the tokenizer
+ * has tokenized up to
+ *
+ * \retval OUT_OF_MEMORY
+ * \retval NO_ERROR
+ *
+ * \relatesalso FAXPP_Parser
+ */
+FAXPP_Error FAXPP_release_buffer(FAXPP_Parser *parser, void **buffer_position);
+
+/**
+ * Provides a new buffer for the parser to continue parsing.
+ *
+ * FAXPP_release_buffer() should have been called before this,
+ * and the remaining data in the old buffer transferred to the new one.
+ * 
+ * \param parser
+ * \param buffer A pointer to the start of the buffer to parse
+ * \param length The length of the given buffer
+ * \param done Set to non-zero if this is the last buffer from the input
+ *
+ * \retval NO_ERROR
+ *
+ * \relatesalso FAXPP_Parser
+ */
+FAXPP_Error FAXPP_continue_parse(FAXPP_Parser *parser, void *buffer,
+                                 unsigned int length, unsigned int done);
+
+/**
  * Parses the next event, placing the information for it
  * into the current event.
  * 

Modified: trunk/faxpp/include/faxpp/tokenizer.h
===================================================================
--- trunk/faxpp/include/faxpp/tokenizer.h	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/include/faxpp/tokenizer.h	2007-08-08 14:02:32 UTC (rev 10)
@@ -39,6 +39,7 @@
  * \relatesalso FAXPP_Tokenizer
  */
 FAXPP_Tokenizer *FAXPP_create_tokenizer();
+
 /**
  * Frees a tokenizer object
  *
@@ -49,6 +50,28 @@
 void FAXPP_free_tokenizer(FAXPP_Tokenizer *tokenizer);
 
 /**
+ * Returns the current FAXPP_DecodeFunction that the tokenizer is using.
+ * 
+ * \param tokenizer
+ * \return The decode function
+ *
+ * \relatesalso FAXPP_Tokenizer
+ */
+FAXPP_DecodeFunction FAXPP_get_tokenizer_decode(const FAXPP_Tokenizer *tokenizer);
+
+/**
+ * Sets the FAXPP_DecodeFunction that the tokenizer uses to decode the XML document.
+ * This will typically be called when an encoding declaration is read, to switch to
+ * the correct decode function.
+ * 
+ * \param tokenizer
+ * \param decode The decode function
+ *
+ * \relatesalso FAXPP_Tokenizer
+ */
+void FAXPP_set_tokenizer_decode(FAXPP_Tokenizer *tokenizer, FAXPP_DecodeFunction decode);
+
+/**
  * Initialize the tokenizer to tokenize the given buffer, returning strings
  * encoded using the given encoding function.
  *
@@ -93,7 +116,7 @@
  * FAXPP_tokenizer_release_buffer() should have been called before this,
  * and the remaining data in the old buffer transferred to the new one.
  * 
- * \param tokenizer The tokenizer to initialize
+ * \param tokenizer
  * \param buffer A pointer to the start of the buffer to tokenize
  * \param length The length of the given buffer
  * \param done Set to non-zero if this is the last buffer from the input

Modified: trunk/faxpp/src/config.h.in
===================================================================
--- trunk/faxpp/src/config.h.in	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/src/config.h.in	2007-08-08 14:02:32 UTC (rev 10)
@@ -3,12 +3,6 @@
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #undef HAVE_DLFCN_H
 
-/* Define to 1 if you have the <fcntl.h> header file. */
-#undef HAVE_FCNTL_H
-
-/* Define to 1 if you have the `getpagesize' function. */
-#undef HAVE_GETPAGESIZE
-
 /* Define to 1 if you have the `gettimeofday' function. */
 #undef HAVE_GETTIMEOFDAY
 
@@ -28,9 +22,6 @@
 /* Define to 1 if you have the `memset' function. */
 #undef HAVE_MEMSET
 
-/* Define to 1 if you have a working `mmap' system call. */
-#undef HAVE_MMAP
-
 /* Define to 1 if your system has a GNU libc compatible `realloc' function,
    and to 0 otherwise. */
 #undef HAVE_REALLOC
@@ -83,6 +74,10 @@
 /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
 #undef TIME_WITH_SYS_TIME
 
+/* Define to 1 if your processor stores words with the most significant byte
+   first (like Motorola and SPARC, unlike Intel and VAX). */
+#undef WORDS_BIGENDIAN
+
 /* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
    <pthread.h>, or <semaphore.h> is not used. If the typedef was allowed, the
    #define below would cause a syntax error. */

Modified: trunk/faxpp/src/xml_parser.c
===================================================================
--- trunk/faxpp/src/xml_parser.c	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/src/xml_parser.c	2007-08-08 14:02:32 UTC (rev 10)
@@ -40,6 +40,14 @@
 static FAXPP_Error wf_start_document_next_event(FAXPP_ParserEnv *env);
 static FAXPP_Error wf_next_event(FAXPP_ParserEnv *env);
 
+static FAXPP_DecodeFunction p_default_encoding_callback(void *userData, const FAXPP_Text *encoding,
+                                                        FAXPP_DecodeFunction sniffedEncoding)
+{
+  // TBD implement this - jpcs
+  // TBD eliminate the callback functions entirely - jpcs
+  return sniffedEncoding;
+}
+
 FAXPP_Parser *FAXPP_create_parser(FAXPP_ParseMode mode, FAXPP_EncodeFunction encode)
 {
   FAXPP_ParserEnv *env = malloc(sizeof(FAXPP_ParserEnv));
@@ -48,6 +56,8 @@
   env->mode = mode;
   env->encode = encode;
 
+  env->encoding = p_default_encoding_callback;
+
   /* The next_event field is set in p_reset_parser() */
 
   env->max_attr_count = INITIAL_ATTRS_SIZE;
@@ -122,6 +132,29 @@
   free(env);
 }
 
+void FAXPP_set_null_terminate(FAXPP_Parser *parser, unsigned int boolean)
+{
+  parser->null_terminate = boolean != 0;
+}
+
+void FAXPP_set_encode(FAXPP_Parser *parser, FAXPP_EncodeFunction encode)
+{
+  parser->encode = encode;
+}
+
+
+void FAXPP_set_encoding_callback(FAXPP_Parser *parser, FAXPP_EncodingCallback callback, void *userData)
+{
+  if(callback == 0) {
+    parser->encoding = p_default_encoding_callback;
+    parser->encoding_user_data = 0;
+  }
+  else {
+    parser->encoding = callback;
+    parser->encoding_user_data = userData;
+  }
+}
+
 static FAXPP_Error p_reset_parser(FAXPP_ParserEnv *env, int allocate_buffer)
 {
   // Reset the stack buffer cursor
@@ -147,7 +180,7 @@
   return NO_ERROR;
 }
 
-FAXPP_Error FAXPP_init_parse(FAXPP_Parser *env, void *buffer, unsigned int length)
+FAXPP_Error FAXPP_init_parse(FAXPP_Parser *env, void *buffer, unsigned int length, unsigned int done)
 {
   FAXPP_Error err = p_reset_parser(env, /*allocate_buffer*/0);
   if(err != 0) return err;
@@ -155,7 +188,7 @@
   env->read = 0;
   env->read_user_data = 0;
 
-  return FAXPP_init_tokenize(&env->tenv, buffer, length, /*done*/1, env->encode);
+  return FAXPP_init_tokenize(&env->tenv, buffer, length, done, env->encode);
 }
 
 static unsigned int p_file_read_callback(void *userData, void *buffer, unsigned int length)
@@ -178,7 +211,6 @@
 
   unsigned int len = env->read(env->read_user_data, env->read_buffer, env->read_buffer_length);
 
-  // TBD boolean for indicating this is the last buffer - jpcs
   return FAXPP_init_tokenize(&env->tenv, env->read_buffer, len, /*done*/len != env->read_buffer_length, env->encode);
 }
 
@@ -291,14 +323,14 @@
   } \
 }
 
-static FAXPP_Error p_read_more(FAXPP_ParserEnv *env)
+FAXPP_Error FAXPP_release_buffer(FAXPP_Parser *env, void **buffer_position)
 {
-  unsigned int len = 0;
-  unsigned int readlen;
   unsigned int i;
   FAXPP_AttrValue *atval;
+  FAXPP_Error err;
 
-  FAXPP_tokenizer_release_buffer(&env->tenv, 0);
+  err = FAXPP_tokenizer_release_buffer(&env->tenv, buffer_position);
+  if(err != 0) return err;
 
   // Copy any strings in the event which point to the old buffer
   // into the event_buffer
@@ -322,6 +354,24 @@
     }
   }
 
+  return NO_ERROR;
+}
+
+FAXPP_Error FAXPP_continue_parse(FAXPP_Parser *env, void *buffer,
+                                 unsigned int length, unsigned int done)
+{
+  return FAXPP_continue_tokenize(&env->tenv, buffer, length, done);
+}
+
+static FAXPP_Error p_read_more(FAXPP_ParserEnv *env)
+{
+  unsigned int len = 0;
+  unsigned int readlen;
+  FAXPP_Error err;
+
+  err = FAXPP_release_buffer(env, 0);
+  if(err != 0) return err;
+
   if(env->tenv.position < env->tenv.buffer_end) {
     // We're half way through a charcter, so we need to copy
     // the partial char to the begining of the buffer to keep
@@ -335,7 +385,7 @@
     return PREMATURE_END_OF_BUFFER;
 
   len += readlen;
-  return FAXPP_continue_tokenize(&env->tenv, env->read_buffer, len, /*done*/len != env->read_buffer_length);
+  return FAXPP_continue_parse(env, env->read_buffer, len, /*done*/len != env->read_buffer_length);
 }
 
 #define p_check_err(err, env) \
@@ -601,6 +651,7 @@
 static FAXPP_Error nc_start_document_next_event(FAXPP_ParserEnv *env)
 {
   FAXPP_Error err = 0;
+  FAXPP_DecodeFunction decode;
 
   p_reset_event(env);
 
@@ -613,15 +664,20 @@
       p_set_location_from_token(env);
       break;
     case XML_DECL_ENCODING_TOKEN:
-      // TBD invoke a callback function to change the transcoder
       p_copy_text_from_token(&env->event.encoding, env, /*useTokenBuffer*/0);
       break;
     case XML_DECL_STANDALONE_TOKEN:
       p_copy_text_from_token(&env->event.standalone, env, /*useTokenBuffer*/0);
       break;
     default:
+      env->buffered_token = 1;
+
+      // Invoke the callback function to change the decoder
+      decode = env->encoding(env->encoding_user_data, &env->event.encoding, env->tenv.decode);
+      if(decode == 0) return UNSUPPORTED_ENCODING;
+      FAXPP_set_tokenizer_decode(&env->tenv, decode);
+
       env->next_event = nc_next_event;
-      env->buffered_token = 1;
       env->event.type = START_DOCUMENT_EVENT;
       return NO_ERROR;
     }
@@ -633,6 +689,8 @@
 
 static FAXPP_Error nc_next_event(FAXPP_ParserEnv *env)
 {
+  // TBD keep all state in the FAXPP_ParserEnv to allow progressive parse to work correctly - jpcs
+
   FAXPP_Error err = 0;
 
   p_reset_event(env);

Modified: trunk/faxpp/src/xml_parser.h
===================================================================
--- trunk/faxpp/src/xml_parser.h	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/src/xml_parser.h	2007-08-08 14:02:32 UTC (rev 10)
@@ -55,6 +55,9 @@
   FAXPP_ParseMode mode;
   FAXPP_EncodeFunction encode;
 
+  FAXPP_EncodingCallback encoding;
+  void *encoding_user_data;
+
   FAXPP_ReadCallback read;
   void *read_user_data;
 

Modified: trunk/faxpp/src/xml_tokenizer.c
===================================================================
--- trunk/faxpp/src/xml_tokenizer.c	2007-08-07 12:16:40 UTC (rev 9)
+++ trunk/faxpp/src/xml_tokenizer.c	2007-08-08 14:02:32 UTC (rev 10)
@@ -19,6 +19,7 @@
 
 #include "xml_tokenizer.h"
 #include "tokenizer_states.h"
+#include "config.h"
 #include <faxpp/token.h>
 
 /*********************
@@ -29,20 +30,11 @@
 
 #define INITIAL_TOKEN_BUFFER_SIZE 64
 
-static unsigned int native_little_endian()
-{
-  // A test to see if the machine is natively little endian
-  // TBD Use configure to figure this out? - jpcs
-  uint32_t num = 0x00000001;
-  uint8_t *ptr = (uint8_t*)&num;
-  return (unsigned int)*ptr;
-}
-
 FAXPP_Error
 sniff_encoding(FAXPP_TokenizerEnv *env)
 {
   // Default encoding is UTF-8
-  env->decode = FAXPP_utf8_decode;
+  FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode);
 
   // Make initial judgement on the encoding
   unsigned char *buf = (unsigned char*)env->position;
@@ -59,8 +51,11 @@
         switch(*buf) {
         case 0x3C:
           /* 00 00 00 3C  UCS-4, big-endian machine (1234 order) */
-          if(native_little_endian()) env->decode = FAXPP_ucs4_be_decode;
-          else env->decode = FAXPP_ucs4_native_decode;
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_be_decode);
+#endif
           break;
         }
         break;
@@ -75,8 +70,11 @@
         switch(*buf) {
         case 0xFF:
           /* 00 00 FE FF  UCS-4, big-endian machine (1234 order) */
-          if(native_little_endian()) env->decode = FAXPP_ucs4_be_decode;
-          else env->decode = FAXPP_ucs4_native_decode;
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_be_decode);
+#endif
           // Skip BOM
           env->position += 4;
           break;
@@ -100,14 +98,11 @@
           return UNSUPPORTED_ENCODING;
         case 0x3F:
           /* 00 3C 00 3F  UTF-16, big-endian */
-          if(native_little_endian()) env->decode = FAXPP_utf16_be_decode;
-          else {
-            env->decode = FAXPP_utf16_native_decode;
-            env->start_element_name_state = utf16_start_element_name_state;
-            env->element_content_state = utf16_element_content_state;
-            if(env->encode == FAXPP_utf16_native_encode)
-              env->encode = 0;
-          }
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_be_decode);
+#endif
           break;
         }
         break;
@@ -123,8 +118,11 @@
         switch(*buf) {
         case 0x00:
           /* 3C 00 00 00  UCS-4, little-endian machine (4321 order) */
-          if(native_little_endian()) env->decode = FAXPP_ucs4_native_decode;
-          else env->decode = FAXPP_ucs4_le_decode;
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_le_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode);
+#endif
           break;
         }
         break;
@@ -132,14 +130,11 @@
         switch(*buf) {
         case 0x00:
           /* 3C 00 3F 00  UTF-16, little-endian */
-          if(native_little_endian()) {
-            env->decode = FAXPP_utf16_native_decode;
-            env->start_element_name_state = utf16_start_element_name_state;
-            env->element_content_state = utf16_element_content_state;
-            if(env->encode == FAXPP_utf16_native_encode)
-              env->encode = 0;
-          }
-          else env->decode = FAXPP_utf16_le_decode;
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_le_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode);
+#endif
           break;
         }
         break;
@@ -151,9 +146,7 @@
         switch(*buf) {
         case 0x6D:
           /* 3C 3F 78 6D  UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, etc. */
-          env->decode = FAXPP_utf8_decode;
-          if(env->encode == FAXPP_utf8_encode)
-            env->encode = 0;
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode);
           break;
         }
         break;
@@ -182,9 +175,7 @@
       switch(*buf++) {
       case 0xBF:
         /* EF BB BF  UTF-8 with byte order mark */
-        env->decode = FAXPP_utf8_decode;
-        if(env->encode == FAXPP_utf8_encode)
-          env->encode = 0;
+        FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode);
         // Skip BOM
         env->position += 3;
       }
@@ -202,14 +193,11 @@
           return UNSUPPORTED_ENCODING;
         default:
           /* FE FF ## ##  UTF-16, big-endian */
-          if(native_little_endian()) env->decode = FAXPP_utf16_be_decode;
-          else {
-            env->decode = FAXPP_utf16_native_decode;
-            env->start_element_name_state = utf16_start_element_name_state;
-            env->element_content_state = utf16_element_content_state;
-            if(env->encode == FAXPP_utf16_native_encode)
-              env->encode = 0;
-          }
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_be_decode);
+#endif
           // Skip BOM
           env->position += 2;
           break;
@@ -217,14 +205,11 @@
         break;
       default:
         /* FE FF ## ##  UTF-16, big-endian */
-        if(native_little_endian()) env->decode = FAXPP_utf16_be_decode;
-        else {
-          env->decode = FAXPP_utf16_native_decode;
-          env->start_element_name_state = utf16_start_element_name_state;
-          env->element_content_state = utf16_element_content_state;
-          if(env->encode == FAXPP_utf16_native_encode)
-            env->encode = 0;
-        }
+#ifdef WORDS_BIGENDIAN
+        FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode);
+#else
+        FAXPP_set_tokenizer_decode(env, FAXPP_utf16_be_decode);
+#endif
         // Skip BOM
         env->position += 2;
         break;
@@ -240,21 +225,21 @@
         switch(*buf) {
         case 0x00:
           /* FF FE 00 00  UCS-4, little-endian machine (4321 order) */
-          if(native_little_endian()) env->decode = FAXPP_ucs4_native_decode;
-          else env->decode = FAXPP_ucs4_le_decode;
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_le_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_ucs4_native_decode);
+#endif
           // Skip BOM
           env->position += 4;
           break;
         default:
           /* FF FE ## ##  UTF-16, little-endian */
-          if(native_little_endian()) {
-            env->decode = FAXPP_utf16_native_decode;
-            env->start_element_name_state = utf16_start_element_name_state;
-            env->element_content_state = utf16_element_content_state;
-            if(env->encode == FAXPP_utf16_native_encode)
-              env->encode = 0;
-          }
-          else env->decode = FAXPP_utf16_le_decode;
+#ifdef WORDS_BIGENDIAN
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_le_decode);
+#else
+          FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode);
+#endif
           // Skip BOM
           env->position += 2;
           break;
@@ -262,14 +247,11 @@
         break;
       default:
         /* FF FE ## ##  UTF-16, little-endian */
-        if(native_little_endian()) {
-          env->decode = FAXPP_utf16_native_decode;
-          env->start_element_name_state = utf16_start_element_name_state;
-          env->element_content_state = utf16_element_content_state;
-          if(env->encode == FAXPP_utf16_native_encode)
-            env->encode = 0;
-        }
-        else env->decode = FAXPP_utf16_le_decode;
+#ifdef WORDS_BIGENDIAN
+        FAXPP_set_tokenizer_decode(env, FAXPP_utf16_le_decode);
+#else
+        FAXPP_set_tokenizer_decode(env, FAXPP_utf16_native_decode);
+#endif
         // Skip BOM
         env->position += 2;
         break;
@@ -279,14 +261,48 @@
     break;
   }
 
-  if(env->decode == FAXPP_utf8_decode) {
-    if(env->encode == FAXPP_utf8_encode)
-      env->encode = 0;
-    env->start_element_name_state = utf8_start_element_name_state;
-    env->element_content_state = utf8_element_content_state;
+  return NO_ERROR;
+}
+
+FAXPP_DecodeFunction
+FAXPP_get_tokenizer_decode(const FAXPP_Tokenizer *tokenizer)
+{
+  return tokenizer->decode;
+}
+
+void
+FAXPP_set_tokenizer_decode(FAXPP_Tokenizer *tokenizer, FAXPP_DecodeFunction decode)
+{
+  if(decode == FAXPP_utf16_native_decode ||
+#ifdef WORDS_BIGENDIAN
+     decode == FAXPP_utf16_be_decode
+#else
+     decode == FAXPP_utf16_le_decode
+#endif
+     ) {
+    tokenizer->decode = FAXPP_utf16_native_decode;
+
+    if(tokenizer->encode == FAXPP_utf16_native_encode)
+      tokenizer->encode = 0;
+
+    tokenizer->start_element_name_state = utf16_start_element_name_state;
+    tokenizer->element_content_state = utf16_element_content_state;
   }
+  else if(decode == FAXPP_utf8_decode) {
+    tokenizer->decode = FAXPP_utf8_decode;
 
-  return NO_ERROR;
+    if(tokenizer->encode == FAXPP_utf8_encode)
+      tokenizer->encode = 0;
+
+    tokenizer->start_element_name_state = utf8_start_element_name_state;
+    tokenizer->element_content_state = utf8_element_content_state;
+  }
+  else {
+    tokenizer->decode = decode;
+
+    tokenizer->start_element_name_state = default_start_element_name_state;
+    tokenizer->element_content_state = default_element_content_state;
+  }
 }
 
 FAXPP_Error


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.