[Faxpp-devel] SF.net SVN: faxpp: [33] trunk/faxpp
Status: Beta
Brought to you by: jpcs
From: <jp...@us...> - 2007-09-28 00:30:06
|
Revision: 33 http://faxpp.svn.sourceforge.net/faxpp/?rev=33&view=rev Author: jpcs Date: 2007-09-27 17:29:16 -0700 (Thu, 27 Sep 2007) Log Message: ----------- Wrote tokenizer states to parse a "PUBLIC" external ID, and to allow comments and PIs in the internal subset. Modified Paths: -------------- trunk/faxpp/include/faxpp/error.h trunk/faxpp/include/faxpp/token.h trunk/faxpp/src/doctype.c trunk/faxpp/src/error.c trunk/faxpp/src/token.c trunk/faxpp/src/tokenizer_states.c trunk/faxpp/src/tokenizer_states.h trunk/faxpp/src/xml_parser.c trunk/faxpp/src/xml_tokenizer.c trunk/faxpp/src/xml_tokenizer.h Modified: trunk/faxpp/include/faxpp/error.h =================================================================== --- trunk/faxpp/include/faxpp/error.h 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/include/faxpp/error.h 2007-09-28 00:29:16 UTC (rev 33) @@ -48,6 +48,10 @@ INVALID_CHAR_IN_DOCTYPE_NAME, INVALID_SYSTEM_ID, EXPECTING_SYSTEM_LITERAL, + INVALID_PUBLIC_ID, + INVALID_CHAR_IN_PUBID_LITERAL, + EXPECTING_PUBID_LITERAL, + INVALID_DTD_DECL, OUT_OF_MEMORY, ELEMENT_NAME_MISMATCH, Modified: trunk/faxpp/include/faxpp/token.h =================================================================== --- trunk/faxpp/include/faxpp/token.h 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/include/faxpp/token.h 2007-09-28 00:29:16 UTC (rev 33) @@ -69,6 +69,7 @@ DOCTYPE_END_TOKEN, SYSTEM_LITERAL_TOKEN, + PUBID_LITERAL_TOKEN, END_OF_BUFFER_TOKEN = 99 } FAXPP_TokenType; Modified: trunk/faxpp/src/doctype.c =================================================================== --- trunk/faxpp/src/doctype.c 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/doctype.c 2007-09-28 00:29:16 UTC (rev 33) @@ -100,7 +100,8 @@ next_char(env); return NO_ERROR; case '[': - env->state = doctype_internal_subset_state; + env->state = internal_subset_state; + env->in_internal_subset = 1; token_end_position(env); report_token(DOCTYPE_NAME_TOKEN, env); next_char(env); @@ -163,7 +164,8 @@ next_char(env); 
return NO_ERROR; case '[': - env->state = doctype_internal_subset_state; + env->state = internal_subset_state; + env->in_internal_subset = 1; token_end_position(env); report_token(DOCTYPE_NAME_TOKEN, env); next_char(env); @@ -199,12 +201,13 @@ env->stored_state = doctype_internal_subset_start_state; env->state = system_id_initial_state1; break; -/* case 'P': */ -/* env->stored_state = doctype_internal_subset_start_state; */ -/* env->state = public_id_initial_state1; */ -/* break; */ + case 'P': + env->stored_state = doctype_internal_subset_start_state; + env->state = public_id_initial_state1; + break; case '[': - env->state = doctype_internal_subset_state; + env->state = internal_subset_state; + env->in_internal_subset = 1; break; case '>': base_state(env); @@ -220,85 +223,123 @@ return NO_ERROR; } +SINGLE_CHAR_STATE(system_id_initial_state1, 'Y', 0, system_id_initial_state2, INVALID_SYSTEM_ID) +SINGLE_CHAR_STATE(system_id_initial_state2, 'S', 0, system_id_initial_state3, INVALID_SYSTEM_ID) +SINGLE_CHAR_STATE(system_id_initial_state3, 'T', 0, system_id_initial_state4, INVALID_SYSTEM_ID) +SINGLE_CHAR_STATE(system_id_initial_state4, 'E', 0, system_id_initial_state5, INVALID_SYSTEM_ID) +SINGLE_CHAR_STATE(system_id_initial_state5, 'M', 0, system_id_ws_state, INVALID_SYSTEM_ID) + FAXPP_Error -doctype_internal_subset_start_state(FAXPP_TokenizerEnv *env) +system_id_ws_state(FAXPP_TokenizerEnv *env) { read_char(env); switch(env->current_char) { WHITESPACE: + env->state = system_literal_start_state; + next_char(env); break; - case '[': - env->state = doctype_internal_subset_state; - break; - case '>': - base_state(env); - report_empty_token(DOCTYPE_END_TOKEN, env); - next_char(env); - token_start_position(env); - return NO_ERROR; default: next_char(env); - return INVALID_DOCTYPE_DECL; + return EXPECTING_WHITESPACE; } - next_char(env); return NO_ERROR; } FAXPP_Error -doctype_internal_subset_state(FAXPP_TokenizerEnv *env) +system_literal_start_state(FAXPP_TokenizerEnv *env) { 
read_char(env); switch(env->current_char) { - case ']': - env->state = doctype_end_state; + WHITESPACE: next_char(env); return NO_ERROR; - LINE_ENDINGS + case '"': + env->state = system_literal_quot_state; + break; + case '\'': + env->state = system_literal_apos_state; + break; default: next_char(env); - return INVALID_DOCTYPE_DECL; + return EXPECTING_SYSTEM_LITERAL; } + next_char(env); + token_start_position(env); return NO_ERROR; } FAXPP_Error -doctype_end_state(FAXPP_TokenizerEnv *env) +system_literal_apos_state(FAXPP_TokenizerEnv *env) { - read_char(env); + while(1) { + read_char(env); - switch(env->current_char) { - WHITESPACE: + switch(env->current_char) { + case '\'': + retrieve_state(env); + token_end_position(env); + report_token(SYSTEM_LITERAL_TOKEN, env); + next_char(env); + return NO_ERROR; + LINE_ENDINGS + default: + if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) { + next_char(env); + return RESTRICTED_CHAR; + } + break; + } next_char(env); - break; - case '>': - base_state(env); - report_empty_token(DOCTYPE_END_TOKEN, env); + } + + // Never happens + return NO_ERROR; +} + +FAXPP_Error +system_literal_quot_state(FAXPP_TokenizerEnv *env) +{ + while(1) { + read_char(env); + + switch(env->current_char) { + case '"': + retrieve_state(env); + token_end_position(env); + report_token(SYSTEM_LITERAL_TOKEN, env); + next_char(env); + return NO_ERROR; + LINE_ENDINGS + default: + if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) { + next_char(env); + return RESTRICTED_CHAR; + } + break; + } next_char(env); - token_start_position(env); - break; - default: - next_char(env); - return INVALID_DOCTYPE_DECL; } + + // Never happens return NO_ERROR; } -SINGLE_CHAR_STATE(system_id_initial_state1, 'Y', 0, system_id_initial_state2, INVALID_SYSTEM_ID) -SINGLE_CHAR_STATE(system_id_initial_state2, 'S', 0, system_id_initial_state3, INVALID_SYSTEM_ID) -SINGLE_CHAR_STATE(system_id_initial_state3, 'T', 0, system_id_initial_state4, 
INVALID_SYSTEM_ID) -SINGLE_CHAR_STATE(system_id_initial_state4, 'E', 0, system_id_initial_state5, INVALID_SYSTEM_ID) -SINGLE_CHAR_STATE(system_id_initial_state5, 'M', 0, system_id_ws_state, INVALID_SYSTEM_ID) +SINGLE_CHAR_STATE(public_id_initial_state1, 'U', 0, public_id_initial_state2, INVALID_PUBLIC_ID) +SINGLE_CHAR_STATE(public_id_initial_state2, 'B', 0, public_id_initial_state3, INVALID_PUBLIC_ID) +SINGLE_CHAR_STATE(public_id_initial_state3, 'L', 0, public_id_initial_state4, INVALID_PUBLIC_ID) +SINGLE_CHAR_STATE(public_id_initial_state4, 'I', 0, public_id_initial_state5, INVALID_PUBLIC_ID) +SINGLE_CHAR_STATE(public_id_initial_state5, 'C', 0, public_id_ws_state, INVALID_PUBLIC_ID) FAXPP_Error -system_id_ws_state(FAXPP_TokenizerEnv *env) +public_id_ws_state(FAXPP_TokenizerEnv *env) { read_char(env); switch(env->current_char) { WHITESPACE: - env->state = system_literal_start_state; + env->state = pubid_literal_start_state; next_char(env); break; default: @@ -309,7 +350,7 @@ } FAXPP_Error -system_literal_start_state(FAXPP_TokenizerEnv *env) +pubid_literal_start_state(FAXPP_TokenizerEnv *env) { read_char(env); @@ -318,14 +359,14 @@ next_char(env); return NO_ERROR; case '"': - env->state = system_literal_quot_state; + env->state = pubid_literal_quot_state; break; case '\'': - env->state = system_literal_apos_state; + env->state = pubid_literal_apos_state; break; default: next_char(env); - return EXPECTING_SYSTEM_LITERAL; + return EXPECTING_PUBID_LITERAL; } next_char(env); token_start_position(env); @@ -333,25 +374,38 @@ } FAXPP_Error -system_literal_apos_state(FAXPP_TokenizerEnv *env) +pubid_literal_apos_state(FAXPP_TokenizerEnv *env) { while(1) { read_char(env); switch(env->current_char) { case '\'': - retrieve_state(env); + env->state = system_id_ws_state; token_end_position(env); - report_token(SYSTEM_LITERAL_TOKEN, env); + report_token(PUBID_LITERAL_TOKEN, env); next_char(env); return NO_ERROR; + // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | 
[-'()+,./:=?;!*#@$_%] LINE_ENDINGS + // A-Z + case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: + case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F: + case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57: + case 0x58: case 0x59: case 0x5A: + // a-z + case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: + case 0x68: case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: + case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77: + case 0x78: case 0x79: case 0x7A: + case ' ': case '-': case '(': case ')': case '+': case ',': case '.': case '/': case ':': + case '=': case '?': case ';': case '!': case '*': case '#': case '@': case '$': case '_': + case '%': + // Valid PubidChar + break; default: - if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) { - next_char(env); - return RESTRICTED_CHAR; - } - break; + next_char(env); + return INVALID_CHAR_IN_PUBID_LITERAL; } next_char(env); } @@ -361,25 +415,38 @@ } FAXPP_Error -system_literal_quot_state(FAXPP_TokenizerEnv *env) +pubid_literal_quot_state(FAXPP_TokenizerEnv *env) { while(1) { read_char(env); switch(env->current_char) { case '"': - retrieve_state(env); + env->state = system_id_ws_state; token_end_position(env); - report_token(SYSTEM_LITERAL_TOKEN, env); + report_token(PUBID_LITERAL_TOKEN, env); next_char(env); return NO_ERROR; + // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] LINE_ENDINGS + // A-Z + case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: + case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F: + case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57: + case 0x58: case 0x59: case 0x5A: + // a-z + case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: + case 0x68: case 
0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F: + case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77: + case 0x78: case 0x79: case 0x7A: + case ' ': case '-': case '(': case ')': case '+': case ',': case '.': case '/': case ':': + case '=': case '?': case ';': case '!': case '*': case '#': case '@': case '$': case '_': + case '%': case '\'': + // Valid PubidChar + break; default: - if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) { - next_char(env); - return RESTRICTED_CHAR; - } - break; + next_char(env); + return INVALID_CHAR_IN_PUBID_LITERAL; } next_char(env); } @@ -388,3 +455,99 @@ return NO_ERROR; } +FAXPP_Error +doctype_internal_subset_start_state(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + WHITESPACE: + break; + case '[': + env->state = internal_subset_state; + env->in_internal_subset = 1; + break; + case '>': + base_state(env); + report_empty_token(DOCTYPE_END_TOKEN, env); + next_char(env); + token_start_position(env); + return NO_ERROR; + default: + next_char(env); + return INVALID_DOCTYPE_DECL; + } + next_char(env); + return NO_ERROR; +} + +FAXPP_Error +internal_subset_state(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + case ']': + env->state = doctype_end_state; + env->in_internal_subset = 0; + break; + WHITESPACE: + break; + case '<': + env->state = internal_subset_markup_state; + break; + default: + next_char(env); + return INVALID_DOCTYPE_DECL; + } + + next_char(env); + return NO_ERROR; +} + +FAXPP_Error +internal_subset_markup_state(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + case '?': + env->state = pi_name_start_state; + break; + case '!': + env->state = comment_start_state1; + break; + LINE_ENDINGS + default: +/* env->state = internal_subset_decl_state; */ + next_char(env); + return INVALID_DTD_DECL; + } + + next_char(env); + token_start_position(env); + return 
NO_ERROR; +} + +FAXPP_Error +doctype_end_state(FAXPP_TokenizerEnv *env) +{ + read_char(env); + + switch(env->current_char) { + WHITESPACE: + next_char(env); + break; + case '>': + base_state(env); + report_empty_token(DOCTYPE_END_TOKEN, env); + next_char(env); + token_start_position(env); + break; + default: + next_char(env); + return INVALID_DOCTYPE_DECL; + } + return NO_ERROR; +} + Modified: trunk/faxpp/src/error.c =================================================================== --- trunk/faxpp/src/error.c 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/error.c 2007-09-28 00:29:16 UTC (rev 33) @@ -91,6 +91,14 @@ return "INVALID_SYSTEM_ID"; case EXPECTING_SYSTEM_LITERAL: return "EXPECTING_SYSTEM_LITERAL"; + case INVALID_PUBLIC_ID: + return "INVALID_PUBLIC_ID"; + case INVALID_CHAR_IN_PUBID_LITERAL: + return "INVALID_CHAR_IN_PUBID_LITERAL"; + case EXPECTING_PUBID_LITERAL: + return "EXPECTING_PUBID_LITERAL"; + case INVALID_DTD_DECL: + return "INVALID_DTD_DECL"; case NO_ERROR: break; } Modified: trunk/faxpp/src/token.c =================================================================== --- trunk/faxpp/src/token.c 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/token.c 2007-09-28 00:29:16 UTC (rev 33) @@ -92,6 +92,8 @@ case SYSTEM_LITERAL_TOKEN: return "SYSTEM_LITERAL_TOKEN"; + case PUBID_LITERAL_TOKEN: + return "PUBID_LITERAL_TOKEN"; case NO_TOKEN: break; Modified: trunk/faxpp/src/tokenizer_states.c =================================================================== --- trunk/faxpp/src/tokenizer_states.c 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/tokenizer_states.c 2007-09-28 00:29:16 UTC (rev 33) @@ -411,8 +411,10 @@ return "doctype_after_name_state"; else if(state == doctype_internal_subset_start_state) return "doctype_internal_subset_start_state"; - else if(state == doctype_internal_subset_state) - return "doctype_internal_subset_state"; + else if(state == internal_subset_state) + return "internal_subset_state"; + else if(state == 
internal_subset_markup_state) + return "internal_subset_markup_state"; else if(state == doctype_end_state) return "doctype_end_state"; @@ -435,6 +437,25 @@ else if(state == system_literal_quot_state) return "system_literal_quot_state"; + else if(state == public_id_initial_state1) + return "public_id_initial_state1"; + else if(state == public_id_initial_state2) + return "public_id_initial_state2"; + else if(state == public_id_initial_state3) + return "public_id_initial_state3"; + else if(state == public_id_initial_state4) + return "public_id_initial_state4"; + else if(state == public_id_initial_state5) + return "public_id_initial_state5"; + else if(state == public_id_ws_state) + return "public_id_ws_state"; + else if(state == pubid_literal_start_state) + return "pubid_literal_start_state"; + else if(state == pubid_literal_apos_state) + return "pubid_literal_apos_state"; + else if(state == pubid_literal_quot_state) + return "pubid_literal_quot_state"; + return "unknown"; } #endif Modified: trunk/faxpp/src/tokenizer_states.h =================================================================== --- trunk/faxpp/src/tokenizer_states.h 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/tokenizer_states.h 2007-09-28 00:29:16 UTC (rev 33) @@ -253,7 +253,8 @@ FAXPP_Error doctype_name_seen_colon_state2(FAXPP_TokenizerEnv *env); FAXPP_Error doctype_after_name_state(FAXPP_TokenizerEnv *env); FAXPP_Error doctype_internal_subset_start_state(FAXPP_TokenizerEnv *env); -FAXPP_Error doctype_internal_subset_state(FAXPP_TokenizerEnv *env); +FAXPP_Error internal_subset_state(FAXPP_TokenizerEnv *env); +FAXPP_Error internal_subset_markup_state(FAXPP_TokenizerEnv *env); FAXPP_Error doctype_end_state(FAXPP_TokenizerEnv *env); FAXPP_Error system_id_initial_state1(FAXPP_TokenizerEnv *env); @@ -266,7 +267,17 @@ FAXPP_Error system_literal_apos_state(FAXPP_TokenizerEnv *env); FAXPP_Error system_literal_quot_state(FAXPP_TokenizerEnv *env); +FAXPP_Error public_id_initial_state1(FAXPP_TokenizerEnv 
*env); +FAXPP_Error public_id_initial_state2(FAXPP_TokenizerEnv *env); +FAXPP_Error public_id_initial_state3(FAXPP_TokenizerEnv *env); +FAXPP_Error public_id_initial_state4(FAXPP_TokenizerEnv *env); +FAXPP_Error public_id_initial_state5(FAXPP_TokenizerEnv *env); +FAXPP_Error public_id_ws_state(FAXPP_TokenizerEnv *env); +FAXPP_Error pubid_literal_start_state(FAXPP_TokenizerEnv *env); +FAXPP_Error pubid_literal_apos_state(FAXPP_TokenizerEnv *env); +FAXPP_Error pubid_literal_quot_state(FAXPP_TokenizerEnv *env); + /********************* * * Tokenizer Helper Functions @@ -340,6 +351,8 @@ if((env)->nesting_level == 0) \ if((env)->seen_doc_element) \ (env)->state = final_state; \ + else if((env)->in_internal_subset) \ + (env)->state = internal_subset_state; \ else (env)->state = initial_misc_state; \ else (env)->state = (env)->element_content_state; \ } Modified: trunk/faxpp/src/xml_parser.c =================================================================== --- trunk/faxpp/src/xml_parser.c 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/xml_parser.c 2007-09-28 00:29:16 UTC (rev 33) @@ -814,6 +814,7 @@ case DOCTYPE_NAME_TOKEN: case DOCTYPE_END_TOKEN: case SYSTEM_LITERAL_TOKEN: + case PUBID_LITERAL_TOKEN: // TBD - jpcs break; Modified: trunk/faxpp/src/xml_tokenizer.c =================================================================== --- trunk/faxpp/src/xml_tokenizer.c 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/xml_tokenizer.c 2007-09-28 00:29:16 UTC (rev 33) @@ -389,6 +389,7 @@ env->nesting_level = 0; env->do_encode = 1; env->seen_doctype = 0; + env->in_internal_subset = 0; env->seen_doc_element = 0; env->buffer_done = done; Modified: trunk/faxpp/src/xml_tokenizer.h =================================================================== --- trunk/faxpp/src/xml_tokenizer.h 2007-09-25 11:50:47 UTC (rev 32) +++ trunk/faxpp/src/xml_tokenizer.h 2007-09-28 00:29:16 UTC (rev 33) @@ -44,6 +44,7 @@ unsigned int nesting_level; unsigned int do_encode:1; unsigned int 
seen_doctype:1; + unsigned int in_internal_subset:1; unsigned int seen_doc_element:1; unsigned int buffer_done:1; unsigned int normalize_attrs:1; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |