[Faxpp-devel] SF.net SVN: faxpp: [44] trunk/faxpp
Status: Beta
Brought to you by:
jpcs
From: <jp...@us...> - 2008-03-15 10:59:43
|
Revision: 44 http://faxpp.svn.sourceforge.net/faxpp/?rev=44&view=rev Author: jpcs Date: 2008-03-15 03:59:42 -0700 (Sat, 15 Mar 2008) Log Message: ----------- Started work on parsing and expanding parameter entity references in DTD markup. Modified Paths: -------------- trunk/faxpp/examples/parser_example.c trunk/faxpp/include/faxpp/error.h trunk/faxpp/include/faxpp/parser.h trunk/faxpp/include/faxpp/token.h trunk/faxpp/src/conditional.c trunk/faxpp/src/doctype.c trunk/faxpp/src/elementdecl.c trunk/faxpp/src/error.c trunk/faxpp/src/reference.c trunk/faxpp/src/token.c trunk/faxpp/src/tokenizer_states.c trunk/faxpp/src/tokenizer_states.h trunk/faxpp/src/xml_parser.c trunk/faxpp/src/xml_tokenizer.c trunk/faxpp/src/xml_tokenizer.h trunk/faxpp/src/xmldecl.c Modified: trunk/faxpp/examples/parser_example.c =================================================================== --- trunk/faxpp/examples/parser_example.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/examples/parser_example.c 2008-03-15 10:59:42 UTC (rev 44) @@ -76,7 +76,7 @@ } while((err = FAXPP_next_event(parser)) == 0) { -/* output_event(FAXPP_get_current_event(parser), stdout); */ + output_event(FAXPP_get_current_event(parser), stdout); if(FAXPP_get_current_event(parser)->type == END_DOCUMENT_EVENT) break; Modified: trunk/faxpp/include/faxpp/error.h =================================================================== --- trunk/faxpp/include/faxpp/error.h 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/include/faxpp/error.h 2008-03-15 10:59:42 UTC (rev 44) @@ -69,6 +69,7 @@ INVALID_DEFAULTDECL, INVALID_ELEMENTDECL_CONTENT, INVALID_CONDITIONAL_SECTION, + IMPROPER_NESTING_OF_ENTITY, OUT_OF_MEMORY, ELEMENT_NAME_MISMATCH, Modified: trunk/faxpp/include/faxpp/parser.h =================================================================== --- trunk/faxpp/include/faxpp/parser.h 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/include/faxpp/parser.h 2008-03-15 10:59:42 UTC (rev 44) @@ -92,8 +92,9 @@ /// The type of external entity to parse typedef enum { - EXTERNAL_PARSED_ENTITY = 0, ///< An external parsed entity - EXTERNAL_SUBSET_ENTITY = 1 ///< An external subset (DTD) + EXTERNAL_PARSED_ENTITY = 0, ///< An external parsed entity + EXTERNAL_SUBSET_ENTITY = 1, ///< An external subset (DTD) + EXTERNAL_IN_MARKUP_ENTITY = 2 ///< An external entity inside DTD markup } FAXPP_EntityType; /** Modified: trunk/faxpp/include/faxpp/token.h =================================================================== --- trunk/faxpp/include/faxpp/token.h 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/include/faxpp/token.h 2008-03-15 10:59:42 UTC (rev 44) @@ -64,6 +64,7 @@ DEC_CHAR_REFERENCE_TOKEN, HEX_CHAR_REFERENCE_TOKEN, PE_REFERENCE_TOKEN, + PE_REFERENCE_IN_MARKUP_TOKEN, DOCTYPE_PREFIX_TOKEN, DOCTYPE_NAME_TOKEN, Modified: trunk/faxpp/src/conditional.c =================================================================== --- trunk/faxpp/src/conditional.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/conditional.c 2008-03-15 10:59:42 UTC (rev 44) @@ -37,6 +37,29 @@ return NO_ERROR; \ } +FAXPP_Error +conditional_ws_state(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + WHITESPACE: + next_char(env); + break; + case '%': + store_state(env); + env->state = parameter_entity_reference_in_markup_state; + next_char(env); + token_start_position(env); + return NO_ERROR; + default: + env->state = conditional_state1; + // No next_char + break; + } + return NO_ERROR; +} + SINGLE_CHAR_STATE(conditional_state1, 'I', 0, conditional_state2, INVALID_CONDITIONAL_SECTION) FAXPP_Error @@ -47,11 +70,9 @@ switch(env->current_char) { case 'N': env->state = include_state1; - env->nesting_level += 1; break; case 'G': env->state = ignore_state1; - env->nesting_level += 1; break; LINE_ENDINGS default: @@ -66,9 +87,28 @@ SINGLE_CHAR_STATE(ignore_state2, 'O', 0, ignore_state3, INVALID_CONDITIONAL_SECTION) SINGLE_CHAR_STATE(ignore_state3, 'R', 0, ignore_state4, INVALID_CONDITIONAL_SECTION) SINGLE_CHAR_STATE(ignore_state4, 'E', ignore_state5, ws_state, INVALID_CONDITIONAL_SECTION) -SINGLE_CHAR_STATE(ignore_state5, '[', 0, ignore_content_state, INVALID_CONDITIONAL_SECTION) FAXPP_Error +ignore_state5(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + case '[': + env->ignore_start_level = env->nesting_level; + env->nesting_level += 1; + env->state = ignore_content_state; + next_char(env); + break; + LINE_ENDINGS + default: + next_char(env); + return INVALID_CONDITIONAL_SECTION; + } + return NO_ERROR; +} + +FAXPP_Error ignore_content_state(FAXPP_TokenizerEnv *env) { read_char(env); @@ -169,7 +209,8 @@ break; case '>': env->nesting_level -= 1; - if(env->nesting_level == 0) { + if(env->nesting_level == env->ignore_start_level) { + env->nesting_level -= 1; base_state(env); } else { @@ -194,4 +235,23 @@ SINGLE_CHAR_STATE(include_state3, 'U', 0, include_state4, INVALID_CONDITIONAL_SECTION) SINGLE_CHAR_STATE(include_state4, 'D', 0, include_state5, INVALID_CONDITIONAL_SECTION) SINGLE_CHAR_STATE(include_state5, 'E', include_state6, ws_state, INVALID_CONDITIONAL_SECTION) -SINGLE_CHAR_STATE(include_state6, '[', 0, external_subset_state, INVALID_CONDITIONAL_SECTION) + +FAXPP_Error +include_state6(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + case '[': + env->nesting_level += 1; + env->state = external_subset_state; + next_char(env); + break; + LINE_ENDINGS + default: + next_char(env); + return INVALID_CONDITIONAL_SECTION; + } + return NO_ERROR; +} + Modified: trunk/faxpp/src/doctype.c =================================================================== --- trunk/faxpp/src/doctype.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/doctype.c 2008-03-15 10:59:42 UTC (rev 44) @@ -726,7 +726,7 @@ next_char(env); return INVALID_DTD_DECL; case '>': - env->nesting_level -= 1; + env->nesting_level -= 2; base_state(env); break; default: @@ -772,8 +772,8 @@ env->state = comment_start_state2; break; case '[': - env->stored_state = conditional_state1; - env->state = ws_state; + env->nesting_level += 1; + env->state = conditional_ws_state; break; case 'E': env->state = elementdecl_or_entitydecl_state; Modified: trunk/faxpp/src/elementdecl.c =================================================================== --- trunk/faxpp/src/elementdecl.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/elementdecl.c 2008-03-15 10:59:42 UTC (rev 44) @@ -24,6 +24,7 @@ switch(env->current_char) { case 'L': + env->nesting_level += 1; env->state = elementdecl_initial_state1; next_char(env); break; @@ -66,9 +67,34 @@ SINGLE_CHAR_STATE(elementdecl_initial_state2, 'M', 0, elementdecl_initial_state3, INVALID_DTD_DECL) SINGLE_CHAR_STATE(elementdecl_initial_state3, 'E', 0, elementdecl_initial_state4, INVALID_DTD_DECL) SINGLE_CHAR_STATE(elementdecl_initial_state4, 'N', 0, elementdecl_initial_state5, INVALID_DTD_DECL) -SINGLE_CHAR_STATE(elementdecl_initial_state5, 'T', elementdecl_name_state1, ws_plus_state, INVALID_DTD_DECL) +SINGLE_CHAR_STATE(elementdecl_initial_state5, 'T', 0, elementdecl_name_ws_state, INVALID_DTD_DECL) FAXPP_Error +elementdecl_name_ws_state(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + WHITESPACE: + next_char(env); + break; + case '%': + // TBD only for external subset - jpcs + store_state(env); + env->state = parameter_entity_reference_in_markup_state; + next_char(env); + token_start_position(env); + return NO_ERROR; + default: + env->state = elementdecl_name_state1; + token_start_position(env); + // No next_char + break; + } + return NO_ERROR; +} + +FAXPP_Error elementdecl_name_state1(FAXPP_TokenizerEnv *env) { read_char(env); @@ -624,6 +650,7 @@ switch(env->current_char) { case '>': + env->nesting_level -= 1; base_state(env); report_empty_token(ELEMENTDECL_END_TOKEN, env); break; Modified: trunk/faxpp/src/error.c =================================================================== --- trunk/faxpp/src/error.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/error.c 2008-03-15 10:59:42 UTC (rev 44) @@ -133,6 +133,8 @@ return "INVALID_ELEMENTDECL_CONTENT"; case INVALID_CONDITIONAL_SECTION: return "INVALID_CONDITIONAL_SECTION"; + case IMPROPER_NESTING_OF_ENTITY: + return "IMPROPER_NESTING_OF_ENTITY"; case NO_ERROR: break; } Modified: trunk/faxpp/src/reference.c =================================================================== --- trunk/faxpp/src/reference.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/reference.c 2008-03-15 10:59:42 UTC (rev 44) @@ -359,6 +359,33 @@ } FAXPP_Error +parameter_entity_reference_in_markup_state(FAXPP_TokenizerEnv *env) +{ + while(1) { + read_char(env); + + switch(env->current_char) { + LINE_ENDINGS + break; + case ';': + retrieve_state(env); + token_end_position(env); + report_token(PE_REFERENCE_IN_MARKUP_TOKEN, env); + next_char(env); + token_start_position(env); + return NO_ERROR; + } + + next_char(env); + if((FAXPP_char_flags(env->current_char) & env->ncname_char) == 0) + return INVALID_CHAR_IN_ENTITY_REFERENCE; + } + + // Never happens + return NO_ERROR; +} + +FAXPP_Error char_reference_state(FAXPP_TokenizerEnv *env) { read_char(env); Modified: trunk/faxpp/src/token.c =================================================================== --- trunk/faxpp/src/token.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/token.c 2008-03-15 10:59:42 UTC (rev 44) @@ -54,6 +54,8 @@ return "ENTITY_REFERENCE_TOKEN"; case PE_REFERENCE_TOKEN: return "PE_REFERENCE_TOKEN"; + case PE_REFERENCE_IN_MARKUP_TOKEN: + return "PE_REFERENCE_IN_MARKUP_TOKEN"; case DEC_CHAR_REFERENCE_TOKEN: return "DEC_CHAR_REFERENCE_TOKEN"; case HEX_CHAR_REFERENCE_TOKEN: Modified: trunk/faxpp/src/tokenizer_states.c =================================================================== --- trunk/faxpp/src/tokenizer_states.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/tokenizer_states.c 2008-03-15 10:59:42 UTC (rev 44) @@ -256,6 +256,8 @@ return "entity_reference_state"; else if(state == parameter_entity_reference_state) return "parameter_entity_reference_state"; + else if(state == parameter_entity_reference_in_markup_state) + return "parameter_entity_reference_in_markup_state"; else if(state == char_reference_state) return "char_reference_state"; else if(state == dec_char_reference_state) @@ -498,6 +500,8 @@ return "elementdecl_initial_state4"; else if(state == elementdecl_initial_state5) return "elementdecl_initial_state5"; + else if(state == elementdecl_name_ws_state) + return "elementdecl_name_ws_state"; else if(state == elementdecl_name_state1) return "elementdecl_name_state1"; else if(state == elementdecl_name_state2) @@ -793,6 +797,8 @@ else if(state == paramentitydecl_end_state) return "paramentitydecl_end_state"; + else if(state == conditional_ws_state) + return "conditional_ws_state"; else if(state == conditional_state1) return "conditional_state1"; else if(state == conditional_state2) Modified: trunk/faxpp/src/tokenizer_states.h =================================================================== --- trunk/faxpp/src/tokenizer_states.h 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/tokenizer_states.h 2008-03-15 10:59:42 UTC (rev 44) @@ -172,6 +172,7 @@ FAXPP_Error quot_entity_reference_state4(FAXPP_TokenizerEnv *env); FAXPP_Error entity_reference_state(FAXPP_TokenizerEnv *env); FAXPP_Error parameter_entity_reference_state(FAXPP_TokenizerEnv *env); +FAXPP_Error parameter_entity_reference_in_markup_state(FAXPP_TokenizerEnv *env); FAXPP_Error char_reference_state(FAXPP_TokenizerEnv *env); FAXPP_Error dec_char_reference_state(FAXPP_TokenizerEnv *env); FAXPP_Error hex_char_reference_state1(FAXPP_TokenizerEnv *env); @@ -299,6 +300,7 @@ FAXPP_Error elementdecl_initial_state3(FAXPP_TokenizerEnv *env); FAXPP_Error elementdecl_initial_state4(FAXPP_TokenizerEnv *env); FAXPP_Error elementdecl_initial_state5(FAXPP_TokenizerEnv *env); +FAXPP_Error elementdecl_name_ws_state(FAXPP_TokenizerEnv *env); FAXPP_Error elementdecl_name_state1(FAXPP_TokenizerEnv *env); FAXPP_Error elementdecl_name_state2(FAXPP_TokenizerEnv *env); FAXPP_Error elementdecl_name_seen_colon_state1(FAXPP_TokenizerEnv *env); @@ -448,6 +450,7 @@ FAXPP_Error paramentitydecl_value_quot_state(FAXPP_TokenizerEnv *env); FAXPP_Error paramentitydecl_end_state(FAXPP_TokenizerEnv *env); +FAXPP_Error conditional_ws_state(FAXPP_TokenizerEnv *env); FAXPP_Error conditional_state1(FAXPP_TokenizerEnv *env); FAXPP_Error conditional_state2(FAXPP_TokenizerEnv *env); FAXPP_Error ignore_state1(FAXPP_TokenizerEnv *env); @@ -489,8 +492,8 @@ return BAD_ENCODING; \ } \ \ -/* printf("%03d:%03d State: %s, Byte: %c, Char: %08X\n", (env)->line, (env)->column, */ \ -/* FAXPP_state_to_string((env)->state), *(unsigned char*)(env)->position, */ \ +/* printf("%03d:%03d Tok:%p L:%03d State: %s, Byte: %c, Char: %08X\n", (env)->line, (env)->column, */ \ +/* (env), (env)->nesting_level, FAXPP_state_to_string((env)->state), *(unsigned char*)(env)->position, */ \ /* (env)->current_char); */ \ } @@ -560,6 +563,8 @@ (env)->state = final_state; \ else if((env)->internal_subset) \ (env)->state = internal_subset_state; \ + else if((env)->in_markup_entity) \ + (env)->state = (env)->prev->state; \ else (env)->state = initial_misc_state; \ } Modified: trunk/faxpp/src/xml_parser.c =================================================================== --- trunk/faxpp/src/xml_parser.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/xml_parser.c 2008-03-15 10:59:42 UTC (rev 44) @@ -1069,7 +1069,7 @@ case XML_DECL_END_TOKEN: env->next_event = nc_unsupported_encoding_next_event; - if(env->tenv->external_subset) { + if(env->tenv->external_subset || env->tenv->in_markup_entity) { // TBD event for start of external subset - jpcs next = nc_dtd_next_event; } @@ -1131,7 +1131,7 @@ default: env->tenv->buffered_token = 1; p_reset_event(env); - if(env->tenv->external_subset) { + if(env->tenv->external_subset || env->tenv->in_markup_entity) { // TBD event for start of external subset - jpcs env->next_event = nc_dtd_next_event; } @@ -1201,14 +1201,6 @@ #define p_compare_text(a, b) (((a)->len == (b)->len) ? memcmp((a)->ptr, (b)->ptr, (a)->len) : ((a)->len - (b)->len)) -/* static int p_compare_text(const FAXPP_Text *a, const FAXPP_Text *b) */ -/* { */ -/* int cmp = a->len - b->len; */ -/* if(cmp != 0) return cmp; */ - -/* return memcmp(a->ptr, b->ptr, a->len); */ -/* } */ - static FAXPP_EntityInfo *p_find_entity_info(const FAXPP_Text *name, FAXPP_EntityInfo *list) { while(list) { @@ -1238,9 +1230,6 @@ env->tenv->line = entv->line; env->tenv->column = entv->column; - if(state == EXTERNAL_PARSED_ENTITY) - env->next_event = nc_start_document_next_event; - // Set the entity on the first new tokenizer if(*initial_entity) { env->tenv->start_of_entity = 1; @@ -1282,8 +1271,13 @@ return err; } +static const char single_space[] = {' '}; + static FAXPP_Error p_parse_entity(FAXPP_ParserEnv *env, FAXPP_EntityInfo *ent, FAXPP_EntityParseState state) { + FAXPP_Error err; + FAXPP_EntityInfo *tmp; + // Check for a recursive entity FAXPP_TokenizerEnv *tokenizer = env->tenv; while(tokenizer) { @@ -1293,18 +1287,47 @@ tokenizer = tokenizer->prev; } + if(state == IN_MARKUP_ENTITY || state == EXTERNAL_IN_MARKUP_ENTITY) { + // Add a space after the entity inside DTD markup + err = FAXPP_push_entity_tokenizer(&env->tenv, IN_MARKUP_ENTITY, (void*)single_space, 1, /*done*/1); + if(err) return err; + + env->tenv->line = ent->line; + env->tenv->column = ent->column; + + FAXPP_set_tokenizer_decode(env->tenv, FAXPP_utf8_decode); + } + if(ent->external) { switch(state) { case ELEMENT_CONTENT_ENTITY: state = EXTERNAL_PARSED_ENTITY; break; case INTERNAL_DTD_ENTITY: state = EXTERNAL_SUBSET_ENTITY; break; case EXTERNAL_DTD_ENTITY: state = EXTERNAL_SUBSET_ENTITY; break; + case IN_MARKUP_ENTITY: state = EXTERNAL_IN_MARKUP_ENTITY; break; default: break; } - return p_parse_external_entity(env, ent, state); + err = p_parse_external_entity(env, ent, state); + if(err) return err; } + else { + tmp = ent; + err = p_parse_internal_entity(env, ent, state, &tmp); + if(err) return err; - return p_parse_internal_entity(env, ent, state, &ent); + if(state == IN_MARKUP_ENTITY || state == EXTERNAL_IN_MARKUP_ENTITY) { + // Add a space before the entity inside DTD markup + err = FAXPP_push_entity_tokenizer(&env->tenv, IN_MARKUP_ENTITY, (void*)single_space, 1, /*done*/1); + if(err) return err; + + env->tenv->line = ent->line; + env->tenv->column = ent->column; + + FAXPP_set_tokenizer_decode(env->tenv, FAXPP_utf8_decode); + } + } + + return NO_ERROR; } static Char32 p_dec_char_ref_value(const FAXPP_Text *text, FAXPP_ParserEnv *env) @@ -1563,7 +1586,27 @@ if(err) goto error; } break; + case PE_REFERENCE_IN_MARKUP_TOKEN: + // Parameter entity references cannot be forward references - + // so we go ahead and look them up straight away + ent = p_find_entity_info(&env->tenv->result_token.value, env->parameter_entities); + // [VC: Entity Declared] + if(ent == 0) { + err = UNDEFINED_ENTITY; + goto error; + } + p_set_text_from_text(&bkup_system, &env->event.system_id); + p_set_text_from_text(&bkup_public, &env->event.public_id); + + err = p_parse_entity(env, ent, IN_MARKUP_ENTITY); + + p_set_text_from_text(&env->event.system_id, &bkup_system); + p_set_text_from_text(&env->event.public_id, &bkup_public); + + if(err) goto error; + break; + case ELEMENTDECL_LPAR_TOKEN: cs = (FAXPP_ContentSpec*)malloc(sizeof(FAXPP_ContentSpec)); memset(cs, 0, sizeof(FAXPP_ContentSpec)); @@ -1699,6 +1742,7 @@ case PUBID_LITERAL_TOKEN: case NDATA_NAME_TOKEN: case PE_REFERENCE_TOKEN: + case PE_REFERENCE_IN_MARKUP_TOKEN: case ELEMENTDECL_PREFIX_TOKEN: case ELEMENTDECL_NAME_TOKEN: case ELEMENTDECL_EMPTY_TOKEN: Modified: trunk/faxpp/src/xml_tokenizer.c =================================================================== --- trunk/faxpp/src/xml_tokenizer.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/xml_tokenizer.c 2008-03-15 10:59:42 UTC (rev 44) @@ -392,6 +392,7 @@ env->nesting_level = 0; env->elemdecl_content_level = 0; + env->ignore_start_level = 0; env->do_encode = 1; env->seen_doctype = 0; @@ -403,6 +404,7 @@ env->internal_dtd_entity = 0; env->external_dtd_entity = 0; env->external_parsed_entity = 0; + env->in_markup_entity = 0; env->start_of_entity = 0; env->start_of_file = 0; @@ -470,6 +472,7 @@ env->external_dtd_entity = state == EXTERNAL_DTD_ENTITY; env->external_parsed_entity = state == EXTERNAL_PARSED_ENTITY; env->external_subset = state == EXTERNAL_SUBSET_ENTITY; + env->in_markup_entity = state == IN_MARKUP_ENTITY || state == EXTERNAL_IN_MARKUP_ENTITY; FAXPP_set_tokenizer_decode(env, env->prev->transcoder.decode); @@ -491,8 +494,12 @@ case EXTERNAL_DTD_ENTITY: env->state = external_subset_state; break; + case IN_MARKUP_ENTITY: + env->state = env->prev->state; + break; case EXTERNAL_PARSED_ENTITY: case EXTERNAL_SUBSET_ENTITY: + case EXTERNAL_IN_MARKUP_ENTITY: env->state = initial_state; break; } @@ -515,7 +522,11 @@ *list = env->prev; if(env->start_of_entity) { - if(env->stored_state != 0 || env->nesting_level != 0 || env->elemdecl_content_level != 0 || + if(env->in_markup_entity) { + if(env->nesting_level != 0 || env->elemdecl_content_level != 0) + err = IMPROPER_NESTING_OF_ENTITY; + } + else if(env->stored_state != 0 || env->nesting_level != 0 || env->elemdecl_content_level != 0 || (env->element_entity && env->state != parsed_entity_state && env->state != default_element_content_rsquare_state1 && env->state != default_element_content_rsquare_state2) || @@ -527,7 +538,8 @@ err = INCOMPLETE_MARKUP_IN_ENTITY_VALUE; } } - else { + + if(env->in_markup_entity || !env->start_of_entity) { // Force the old tokenizer token to point into the token buffer FAXPP_tokenizer_release_buffer(env, 0); Modified: trunk/faxpp/src/xml_tokenizer.h =================================================================== --- trunk/faxpp/src/xml_tokenizer.h 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/xml_tokenizer.h 2008-03-15 10:59:42 UTC (rev 44) @@ -52,6 +52,7 @@ unsigned int nesting_level; unsigned int elemdecl_content_level; + unsigned int ignore_start_level; unsigned int do_encode:1; unsigned int buffer_done:1; @@ -64,6 +65,7 @@ unsigned int internal_dtd_entity:1; unsigned int external_dtd_entity:1; unsigned int external_parsed_entity:1; + unsigned int in_markup_entity:1; unsigned int normalize_attrs:1; unsigned int user_provided_decode:1; @@ -99,14 +101,16 @@ struct FAXPP_TokenizerEnv_s *prev; }; -// The first two values are the same as the values in FAXPP_EntityType +// The first three values are the same as the values in FAXPP_EntityType typedef enum { EXTERNAL_PARSED_ENTITY2 = EXTERNAL_PARSED_ENTITY, EXTERNAL_SUBSET_ENTITY2 = EXTERNAL_SUBSET_ENTITY, + EXTERNAL_IN_MARKUP_ENTITY2 = EXTERNAL_IN_MARKUP_ENTITY, ELEMENT_CONTENT_ENTITY, INTERNAL_DTD_ENTITY, EXTERNAL_DTD_ENTITY, + IN_MARKUP_ENTITY, ATTRIBUTE_VALUE_ENTITY } FAXPP_EntityParseState; Modified: trunk/faxpp/src/xmldecl.c =================================================================== --- trunk/faxpp/src/xmldecl.c 2008-03-14 15:24:54 UTC (rev 43) +++ trunk/faxpp/src/xmldecl.c 2008-03-15 10:59:42 UTC (rev 44) @@ -16,6 +16,7 @@ #include "tokenizer_states.h" #include "char_classes.h" +#include "xml_parser.h" FAXPP_Error xml_decl_or_markup_state(FAXPP_TokenizerEnv *env) @@ -29,7 +30,10 @@ token_start_position(env); break; case '!': - if(env->external_subset) + // TBD Do this in all other places where it's not an XMLDecl - jpcs + if(env->in_markup_entity) + return INVALID_DTD_DECL; // TBD is this right? - jpcs + else if(env->external_subset) env->state = external_subset_markup_state; else env->state = initial_markup_state; @@ -164,7 +168,7 @@ env->state = xml_decl_version_state2; break; case 'e': - if(env->external_parsed_entity || env->external_subset) { + if(env->external_parsed_entity || env->external_subset || env->in_markup_entity) { env->state = xml_decl_encoding_state2; break; } @@ -330,13 +334,13 @@ WHITESPACE: break; case '?': - if(env->external_parsed_entity || env->external_subset) goto invalid_char; + if(env->external_parsed_entity || env->external_subset || env->in_markup_entity) goto invalid_char; env->state = xml_decl_seen_question_state; token_start_position(env); break; case 's': - if(env->external_parsed_entity || env->external_subset) goto invalid_char; + if(env->external_parsed_entity || env->external_subset || env->in_markup_entity) goto invalid_char; env->state = xml_decl_standalone_state2; break; @@ -524,7 +528,7 @@ next_char(env); break; case 's': - if(!env->external_parsed_entity && !env->external_subset) { + if(!env->external_parsed_entity && !env->external_subset && !env->in_markup_entity) { env->state = xml_decl_standalone_state2; next_char(env); break; @@ -698,17 +702,42 @@ return NO_ERROR; } +static const char single_space[] = {' '}; + FAXPP_Error xml_decl_seen_question_state(FAXPP_TokenizerEnv *env) { + FAXPP_Error err; + FAXPP_TokenizerEnv *tok; + read_char(env); switch(env->current_char) { case '>': base_state(env); + report_empty_token(XML_DECL_END_TOKEN, env); next_char(env); token_start_position(env); + + if(env->in_markup_entity) { + // Add a space before the entity inside DTD markup + err = FAXPP_push_entity_tokenizer(&env, IN_MARKUP_ENTITY, (void*)single_space, 1, /*done*/1); + if(err) return err; + + tok = env; + while(tok && tok->entity == 0) { + tok = tok->prev; + } + + if(tok) { + env->line = tok->entity->line; + env->column = tok->entity->column; + } + + FAXPP_set_tokenizer_decode(env, FAXPP_utf8_decode); + } + break; LINE_ENDINGS default: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |