[Faxpp-devel] SF.net SVN: faxpp: [19] trunk/faxpp
Status: Beta
Brought to you by:
jpcs
|
From: <jp...@us...> - 2007-08-19 22:55:50
|
Revision: 19
http://faxpp.svn.sourceforge.net/faxpp/?rev=19&view=rev
Author: jpcs
Date: 2007-08-19 15:55:48 -0700 (Sun, 19 Aug 2007)
Log Message:
-----------
Implemented attribute value whitespace normalization. This happens in
the tokenizer, which will copy an attribute value to its buffer if it
finds a whitespace character that needs normalizing.
Moved the parser encoding and attribute value normalization state to
the tokenizer, so it is no longer duplicated in both the parser and the tokenizer.
Modified Paths:
--------------
trunk/faxpp/TODO
trunk/faxpp/examples/tokenizer_example.c
trunk/faxpp/include/faxpp/tokenizer.h
trunk/faxpp/src/attr_states.h
trunk/faxpp/src/tokenizer_states.h
trunk/faxpp/src/xml_parser.c
trunk/faxpp/src/xml_parser.h
trunk/faxpp/src/xml_tokenizer.c
trunk/faxpp/src/xml_tokenizer.h
Modified: trunk/faxpp/TODO
===================================================================
--- trunk/faxpp/TODO 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/TODO 2007-08-19 22:55:48 UTC (rev 19)
@@ -1,7 +1,6 @@
Small tasks
-----------
-Normalize whitespace in attribute values
Normalize line endings in element character content / PI values / comment values
Accept XML 1.1 line endings as whitespace
Handle "xml" namespace properly
Modified: trunk/faxpp/examples/tokenizer_example.c
===================================================================
--- trunk/faxpp/examples/tokenizer_example.c 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/examples/tokenizer_example.c 2007-08-19 22:55:48 UTC (rev 19)
@@ -52,7 +52,7 @@
exit(-1);
}
- FAXPP_Tokenizer *tokenizer = FAXPP_create_tokenizer();
+ FAXPP_Tokenizer *tokenizer = FAXPP_create_tokenizer(FAXPP_utf8_encode);
if(tokenizer == 0) {
printf("ERROR: out of memory\n");
exit(1);
@@ -70,7 +70,7 @@
length = fread(xml, 1, sizeof(xml), file);
- err = FAXPP_init_tokenize(tokenizer, xml, length, length != sizeof(xml), FAXPP_utf8_encode);
+ err = FAXPP_init_tokenize(tokenizer, xml, length, length != sizeof(xml));
if(err != NO_ERROR) {
printf("ERROR: %s\n", FAXPP_err_to_string(err));
exit(1);
Modified: trunk/faxpp/include/faxpp/tokenizer.h
===================================================================
--- trunk/faxpp/include/faxpp/tokenizer.h 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/include/faxpp/tokenizer.h 2007-08-19 22:55:48 UTC (rev 19)
@@ -34,11 +34,12 @@
/**
* Creates a tokenizer object
*
+ * \param encode The encoding function to use when encoding token values
* \return A pointer to the tokenizer object, or 0 if out of memory.
*
* \relatesalso FAXPP_Tokenizer
*/
-FAXPP_Tokenizer *FAXPP_create_tokenizer();
+FAXPP_Tokenizer *FAXPP_create_tokenizer(FAXPP_EncodeFunction encode);
/**
* Frees a tokenizer object
@@ -79,7 +80,6 @@
* \param buffer A pointer to the start of the buffer to tokenize
* \param length The length of the given buffer
* \param done Set to non-zero if this is the last buffer from the input
- * \param encode The encoding function to use when encoding token values
*
* \retval UNSUPPORTED_ENCODING If the encoding sniffing algorithm cannot recognize
* the encoding of the buffer
@@ -88,8 +88,7 @@
* \relatesalso FAXPP_Tokenizer
*/
FAXPP_Error FAXPP_init_tokenize(FAXPP_Tokenizer *tokenizer, void *buffer,
- unsigned int length, unsigned int done,
- FAXPP_EncodeFunction encode);
+ unsigned int length, unsigned int done);
/**
* Instructs the tokenizer to release any dependencies it has on it's current buffer.
Modified: trunk/faxpp/src/attr_states.h
===================================================================
--- trunk/faxpp/src/attr_states.h 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/src/attr_states.h 2007-08-19 22:55:48 UTC (rev 19)
@@ -280,8 +280,14 @@
case '<':
next_char(env);
return INVALID_CHAR_IN_ATTRIBUTE;
- WHITESPACE:
- env->current_char = ' ';
+ LINE_ENDINGS
+ case '\t':
+ if(env->normalize_attrs) {
+ // Move the token to the buffer, to normalize it
+ FAXPP_Error err = FAXPP_tokenizer_release_buffer(env, 0);
+ if(err != NO_ERROR) return err;
+ env->current_char = ' ';
+ }
break;
default:
DEFAULT_CASE;
@@ -335,9 +341,16 @@
case '<':
next_char(env);
return INVALID_CHAR_IN_ATTRIBUTE;
- WHITESPACE:
- env->current_char = ' ';
+ LINE_ENDINGS
+ case '\t': {
+ if(env->normalize_attrs) {
+ // Move the token to the buffer, to normalize it
+ FAXPP_Error err = FAXPP_tokenizer_release_buffer(env, 0);
+ if(err != NO_ERROR) return err;
+ env->current_char = ' ';
+ }
break;
+ }
default:
DEFAULT_CASE;
Modified: trunk/faxpp/src/tokenizer_states.h
===================================================================
--- trunk/faxpp/src/tokenizer_states.h 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/src/tokenizer_states.h 2007-08-19 22:55:48 UTC (rev 19)
@@ -276,12 +276,7 @@
#define next_char(env) \
{ \
if((env)->token_buffer.cursor) { \
- FAXPP_Error err; \
- if((env)->encode) { \
- err = FAXPP_buffer_append_ch(&(env)->token_buffer, (env)->encode, (env)->current_char); \
- } else { \
- err = FAXPP_buffer_append(&(env)->token_buffer, (env)->position, (env)->char_len); \
- } \
+ FAXPP_Error err = FAXPP_buffer_append_ch(&(env)->token_buffer, (env)->encode, (env)->current_char); \
if(err != 0) return err; \
} \
\
Modified: trunk/faxpp/src/xml_parser.c
===================================================================
--- trunk/faxpp/src/xml_parser.c 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/src/xml_parser.c 2007-08-19 22:55:48 UTC (rev 19)
@@ -32,7 +32,7 @@
#define INITIAL_EVENT_BUFFER_SIZE 256
#define INITIAL_STACK_BUFFER_SIZE 1024
-FAXPP_Error init_tokenizer_internal(FAXPP_TokenizerEnv *env);
+FAXPP_Error init_tokenizer_internal(FAXPP_TokenizerEnv *env, FAXPP_EncodeFunction encode);
void free_tokenizer_internal(FAXPP_TokenizerEnv *env);
static FAXPP_Error nc_next_event(FAXPP_ParserEnv *env);
@@ -48,21 +48,6 @@
FAXPP_ParserEnv *env = malloc(sizeof(FAXPP_ParserEnv));
memset(env, 0, sizeof(FAXPP_ParserEnv));
- env->encode = encode;
-
- switch(mode) {
- case NO_CHECKS_PARSE_MODE:
- env->main_next_event = nc_next_event;
- env->normalize_attrs = 0;
- break;
- case WELL_FORMED_PARSE_MODE:
- env->main_next_event = wf_next_event;
- env->normalize_attrs = 1;
- break;
- }
-
- env->next_event = env->main_next_event;
-
env->max_attr_count = INITIAL_ATTRS_SIZE;
env->attrs = (FAXPP_Attribute*)malloc(sizeof(FAXPP_Attribute) * INITIAL_ATTRS_SIZE);
if(!env->attrs) {
@@ -80,11 +65,24 @@
return 0;
}
- if(init_tokenizer_internal(&env->tenv) == OUT_OF_MEMORY) {
+ if(init_tokenizer_internal(&env->tenv, encode) == OUT_OF_MEMORY) {
FAXPP_free_parser(env);
return 0;
}
+ switch(mode) {
+ case NO_CHECKS_PARSE_MODE:
+ env->main_next_event = nc_next_event;
+ FAXPP_set_normalize_attrs(env, 0);
+ break;
+ case WELL_FORMED_PARSE_MODE:
+ env->main_next_event = wf_next_event;
+ FAXPP_set_normalize_attrs(env, 1);
+ break;
+ }
+
+ env->next_event = env->main_next_event;
+
return env;
}
@@ -142,12 +140,12 @@
void FAXPP_set_normalize_attrs(FAXPP_Parser *parser, unsigned int boolean)
{
- parser->normalize_attrs = boolean != 0;
+ parser->tenv.normalize_attrs = boolean != 0;
}
void FAXPP_set_encode(FAXPP_Parser *parser, FAXPP_EncodeFunction encode)
{
- parser->encode = encode;
+ parser->tenv.encode = encode;
}
@@ -188,7 +186,7 @@
env->read = 0;
env->read_user_data = 0;
- return FAXPP_init_tokenize(&env->tenv, buffer, length, done, env->encode);
+ return FAXPP_init_tokenize(&env->tenv, buffer, length, done);
}
static unsigned int p_file_read_callback(void *userData, void *buffer, unsigned int length)
@@ -211,7 +209,7 @@
unsigned int len = env->read(env->read_user_data, env->read_buffer, env->read_buffer_length);
- return FAXPP_init_tokenize(&env->tenv, env->read_buffer, len, /*done*/len != env->read_buffer_length, env->encode);
+ return FAXPP_init_tokenize(&env->tenv, env->read_buffer, len, /*done*/len != env->read_buffer_length);
}
FAXPP_Error FAXPP_next_event(FAXPP_Parser *env)
@@ -317,7 +315,7 @@
void *newPtr = (env)->event_buffer.cursor; \
FAXPP_Error err = FAXPP_buffer_append(&(env)->event_buffer, (text)->ptr, (text)->len); \
if((env)->null_terminate && err == 0) \
- err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->encode, 0); \
+ err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->tenv.encode, 0); \
if(err != 0) return err; \
(text)->ptr = newPtr; \
} \
@@ -455,7 +453,7 @@
(text)->len = (o)->len; \
FAXPP_Error err = FAXPP_buffer_append(&(env)->stack_buffer, (o)->ptr, (o)->len); \
if((env)->null_terminate && err == 0) \
- err = FAXPP_buffer_append_ch(&(env)->stack_buffer, (env)->encode, 0); \
+ err = FAXPP_buffer_append_ch(&(env)->stack_buffer, (env)->tenv.encode, 0); \
if(err != 0) return err; \
/* } else { */ \
/* p_set_text_from_text((text), (o)); */ \
@@ -488,7 +486,7 @@
text->len = buffer->cursor - text->ptr;
if(env->null_terminate)
- return FAXPP_buffer_append_ch(buffer, env->encode, 0);
+ return FAXPP_buffer_append_ch(buffer, env->tenv.encode, 0);
return NO_ERROR;
}
@@ -503,7 +501,7 @@
(text)->ptr = (env)->event_buffer.cursor; \
FAXPP_Error err = FAXPP_buffer_append(&(env)->event_buffer, (env)->tenv.result_token.value.ptr, (env)->tenv.result_token.value.len); \
if((env)->null_terminate && err == 0) \
- err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->encode, 0); \
+ err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->tenv.encode, 0); \
if(err != 0) return err; \
} \
}
@@ -588,10 +586,10 @@
#define p_set_text_to_char(text, env, ch) \
{ \
(text)->ptr = (env)->event_buffer.cursor; \
- FAXPP_Error err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->encode, (ch)); \
+ FAXPP_Error err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->tenv.encode, (ch)); \
(text)->len = (env)->event_buffer.cursor - (text)->ptr; \
if((env)->null_terminate && err == 0) \
- err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->encode, 0); \
+ err = FAXPP_buffer_append_ch(&(env)->event_buffer, (env)->tenv.encode, 0); \
if(err != 0) return err; \
}
@@ -736,19 +734,19 @@
if(env->event.encoding.ptr == 0) {
env->next_event = env->main_next_event;
}
- else if(p_case_insensitive_equals("UTF-8", env->encode, &env->event.encoding)) {
+ else if(p_case_insensitive_equals("UTF-8", env->tenv.encode, &env->event.encoding)) {
env->next_event = env->main_next_event;
if(env->tenv.decode != FAXPP_utf8_decode)
return BAD_ENCODING;
}
- else if(p_case_insensitive_equals("UTF-16", env->encode, &env->event.encoding)) {
+ else if(p_case_insensitive_equals("UTF-16", env->tenv.encode, &env->event.encoding)) {
env->next_event = env->main_next_event;
if(env->tenv.decode != FAXPP_utf16_le_decode &&
env->tenv.decode != FAXPP_utf16_be_decode &&
env->tenv.decode != FAXPP_utf16_native_decode)
return BAD_ENCODING;
}
- else if(p_case_insensitive_equals("UTF-16LE", env->encode, &env->event.encoding)) {
+ else if(p_case_insensitive_equals("UTF-16LE", env->tenv.encode, &env->event.encoding)) {
env->next_event = env->main_next_event;
if(env->tenv.decode != FAXPP_utf16_le_decode
#ifndef WORDS_BIGENDIAN
@@ -757,7 +755,7 @@
)
return BAD_ENCODING;
}
- else if(p_case_insensitive_equals("UTF-16BE", env->encode, &env->event.encoding)) {
+ else if(p_case_insensitive_equals("UTF-16BE", env->tenv.encode, &env->event.encoding)) {
env->next_event = env->main_next_event;
if(env->tenv.decode != FAXPP_utf16_be_decode
#ifdef WORDS_BIGENDIAN
@@ -766,14 +764,14 @@
)
return BAD_ENCODING;
}
- else if(p_case_insensitive_equals("ISO-10646-UCS-4", env->encode, &env->event.encoding)) {
+ else if(p_case_insensitive_equals("ISO-10646-UCS-4", env->tenv.encode, &env->event.encoding)) {
env->next_event = env->main_next_event;
if(env->tenv.decode != FAXPP_ucs4_le_decode &&
env->tenv.decode != FAXPP_ucs4_be_decode &&
env->tenv.decode != FAXPP_ucs4_native_decode)
return BAD_ENCODING;
}
- else if(p_case_insensitive_equals("ISO-8859-1", env->encode, &env->event.encoding)) {
+ else if(p_case_insensitive_equals("ISO-8859-1", env->tenv.encode, &env->event.encoding)) {
FAXPP_set_decode(env, FAXPP_iso_8859_1_decode);
}
@@ -1231,11 +1229,11 @@
}
attrVal->value.ptr = env->event_buffer.cursor;
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, ch);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, ch);
attrVal->value.len = env->event_buffer.cursor - attrVal->value.ptr;
if(env->null_terminate && err == 0)
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, 0);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, 0);
if(err != 0) return err;
break;
case HEX_CHAR_REFERENCE_EVENT:
@@ -1247,11 +1245,11 @@
}
attrVal->value.ptr = env->event_buffer.cursor;
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, ch);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, ch);
attrVal->value.len = env->event_buffer.cursor - attrVal->value.ptr;
if(env->null_terminate && err == 0)
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, 0);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, 0);
if(err != 0) return err;
break;
default: break;
@@ -1259,7 +1257,7 @@
}
/* Normalize the attribute values if required */
- if(env->normalize_attrs && attr->value.next != 0) {
+ if(env->tenv.normalize_attrs && attr->value.next != 0) {
err = p_normalize_attr_value(&tmpText, &env->event_buffer, &attr->value, env);
if(err != 0) return err;
@@ -1367,11 +1365,11 @@
}
env->event.value.ptr = env->event_buffer.cursor;
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, ch);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, ch);
env->event.value.len = env->event_buffer.cursor - env->event.value.ptr;
if(env->null_terminate && err == 0)
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, 0);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, 0);
if(err != 0) return err;
break;
@@ -1384,11 +1382,11 @@
}
env->event.value.ptr = env->event_buffer.cursor;
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, ch);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, ch);
env->event.value.len = env->event_buffer.cursor - env->event.value.ptr;
if(env->null_terminate && err == 0)
- err = FAXPP_buffer_append_ch(&env->event_buffer, env->encode, 0);
+ err = FAXPP_buffer_append_ch(&env->event_buffer, env->tenv.encode, 0);
if(err != 0) return err;
break;
Modified: trunk/faxpp/src/xml_parser.h
===================================================================
--- trunk/faxpp/src/xml_parser.h 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/src/xml_parser.h 2007-08-19 22:55:48 UTC (rev 19)
@@ -55,8 +55,6 @@
FAXPP_NextEvent next_event;
FAXPP_NextEvent main_next_event;
- FAXPP_EncodeFunction encode;
-
FAXPP_ReadCallback read;
void *read_user_data;
@@ -65,7 +63,6 @@
FAXPP_TokenizerEnv tenv;
unsigned int null_terminate:1;
- unsigned int normalize_attrs:1;
unsigned int err_line;
unsigned int err_column;
Modified: trunk/faxpp/src/xml_tokenizer.c
===================================================================
--- trunk/faxpp/src/xml_tokenizer.c 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/src/xml_tokenizer.c 2007-08-19 22:55:48 UTC (rev 19)
@@ -285,7 +285,7 @@
tokenizer->decode = FAXPP_utf16_native_decode;
if(tokenizer->encode == FAXPP_utf16_native_encode)
- tokenizer->encode = 0;
+ tokenizer->do_encode = 0;
tokenizer->start_element_name_state = utf16_start_element_name_state;
tokenizer->element_content_state = utf16_element_content_state;
@@ -294,7 +294,7 @@
tokenizer->decode = FAXPP_utf8_decode;
if(tokenizer->encode == FAXPP_utf8_encode)
- tokenizer->encode = 0;
+ tokenizer->do_encode = 0;
tokenizer->start_element_name_state = utf8_start_element_name_state;
tokenizer->element_content_state = utf8_element_content_state;
@@ -330,9 +330,10 @@
}
FAXPP_Error
-init_tokenizer_internal(FAXPP_TokenizerEnv *env)
+init_tokenizer_internal(FAXPP_TokenizerEnv *env, FAXPP_EncodeFunction encode)
{
memset(env, 0, sizeof(FAXPP_TokenizerEnv));
+ env->encode = encode;
return FAXPP_init_buffer(&env->token_buffer, INITIAL_TOKEN_BUFFER_SIZE, change_token_buffer, env);
}
@@ -343,12 +344,12 @@
}
FAXPP_Tokenizer *
-FAXPP_create_tokenizer()
+FAXPP_create_tokenizer(FAXPP_EncodeFunction encode)
{
FAXPP_TokenizerEnv *result = malloc(sizeof(FAXPP_TokenizerEnv));
if(result == 0) return 0;
- if(init_tokenizer_internal(result) == OUT_OF_MEMORY) {
+ if(init_tokenizer_internal(result, encode) == OUT_OF_MEMORY) {
free(result);
return 0;
}
@@ -364,7 +365,7 @@
}
FAXPP_Error
-FAXPP_init_tokenize(FAXPP_Tokenizer *env, void *buffer, unsigned int length, unsigned int done, FAXPP_EncodeFunction encode)
+FAXPP_init_tokenize(FAXPP_Tokenizer *env, void *buffer, unsigned int length, unsigned int done)
{
env->buffer = buffer;
env->buffer_end = buffer + length;
@@ -377,11 +378,11 @@
env->column = 0;
env->nesting_level = 0;
+ env->do_encode = 1;
env->seen_doc_element = 0;
env->buffer_done = done;
env->decode = 0;
- env->encode = encode;
env->token_buffer.cursor = 0;
Modified: trunk/faxpp/src/xml_tokenizer.h
===================================================================
--- trunk/faxpp/src/xml_tokenizer.h 2007-08-15 00:33:19 UTC (rev 18)
+++ trunk/faxpp/src/xml_tokenizer.h 2007-08-19 22:55:48 UTC (rev 19)
@@ -42,8 +42,10 @@
unsigned int column;
unsigned int nesting_level;
+ unsigned int do_encode:1;
unsigned int seen_doc_element:1;
unsigned int buffer_done:1;
+ unsigned int normalize_attrs:1;
FAXPP_DecodeFunction decode;
FAXPP_EncodeFunction encode;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|