[Faxpp-devel] SF.net SVN: faxpp: [33] trunk/faxpp
Status: Beta
Brought to you by:
jpcs
|
From: <jp...@us...> - 2007-09-28 00:30:06
|
Revision: 33
http://faxpp.svn.sourceforge.net/faxpp/?rev=33&view=rev
Author: jpcs
Date: 2007-09-27 17:29:16 -0700 (Thu, 27 Sep 2007)
Log Message:
-----------
Wrote tokenizer states to parse a "PUBLIC" external ID, and to allow
comments and PIs in the internal subset.
Modified Paths:
--------------
trunk/faxpp/include/faxpp/error.h
trunk/faxpp/include/faxpp/token.h
trunk/faxpp/src/doctype.c
trunk/faxpp/src/error.c
trunk/faxpp/src/token.c
trunk/faxpp/src/tokenizer_states.c
trunk/faxpp/src/tokenizer_states.h
trunk/faxpp/src/xml_parser.c
trunk/faxpp/src/xml_tokenizer.c
trunk/faxpp/src/xml_tokenizer.h
Modified: trunk/faxpp/include/faxpp/error.h
===================================================================
--- trunk/faxpp/include/faxpp/error.h 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/include/faxpp/error.h 2007-09-28 00:29:16 UTC (rev 33)
@@ -48,6 +48,10 @@
INVALID_CHAR_IN_DOCTYPE_NAME,
INVALID_SYSTEM_ID,
EXPECTING_SYSTEM_LITERAL,
+ INVALID_PUBLIC_ID,
+ INVALID_CHAR_IN_PUBID_LITERAL,
+ EXPECTING_PUBID_LITERAL,
+ INVALID_DTD_DECL,
OUT_OF_MEMORY,
ELEMENT_NAME_MISMATCH,
Modified: trunk/faxpp/include/faxpp/token.h
===================================================================
--- trunk/faxpp/include/faxpp/token.h 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/include/faxpp/token.h 2007-09-28 00:29:16 UTC (rev 33)
@@ -69,6 +69,7 @@
DOCTYPE_END_TOKEN,
SYSTEM_LITERAL_TOKEN,
+ PUBID_LITERAL_TOKEN,
END_OF_BUFFER_TOKEN = 99
} FAXPP_TokenType;
Modified: trunk/faxpp/src/doctype.c
===================================================================
--- trunk/faxpp/src/doctype.c 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/doctype.c 2007-09-28 00:29:16 UTC (rev 33)
@@ -100,7 +100,8 @@
next_char(env);
return NO_ERROR;
case '[':
- env->state = doctype_internal_subset_state;
+ env->state = internal_subset_state;
+ env->in_internal_subset = 1;
token_end_position(env);
report_token(DOCTYPE_NAME_TOKEN, env);
next_char(env);
@@ -163,7 +164,8 @@
next_char(env);
return NO_ERROR;
case '[':
- env->state = doctype_internal_subset_state;
+ env->state = internal_subset_state;
+ env->in_internal_subset = 1;
token_end_position(env);
report_token(DOCTYPE_NAME_TOKEN, env);
next_char(env);
@@ -199,12 +201,13 @@
env->stored_state = doctype_internal_subset_start_state;
env->state = system_id_initial_state1;
break;
-/* case 'P': */
-/* env->stored_state = doctype_internal_subset_start_state; */
-/* env->state = public_id_initial_state1; */
-/* break; */
+ case 'P':
+ env->stored_state = doctype_internal_subset_start_state;
+ env->state = public_id_initial_state1;
+ break;
case '[':
- env->state = doctype_internal_subset_state;
+ env->state = internal_subset_state;
+ env->in_internal_subset = 1;
break;
case '>':
base_state(env);
@@ -220,85 +223,123 @@
return NO_ERROR;
}
+SINGLE_CHAR_STATE(system_id_initial_state1, 'Y', 0, system_id_initial_state2, INVALID_SYSTEM_ID)
+SINGLE_CHAR_STATE(system_id_initial_state2, 'S', 0, system_id_initial_state3, INVALID_SYSTEM_ID)
+SINGLE_CHAR_STATE(system_id_initial_state3, 'T', 0, system_id_initial_state4, INVALID_SYSTEM_ID)
+SINGLE_CHAR_STATE(system_id_initial_state4, 'E', 0, system_id_initial_state5, INVALID_SYSTEM_ID)
+SINGLE_CHAR_STATE(system_id_initial_state5, 'M', 0, system_id_ws_state, INVALID_SYSTEM_ID)
+
FAXPP_Error
-doctype_internal_subset_start_state(FAXPP_TokenizerEnv *env)
+system_id_ws_state(FAXPP_TokenizerEnv *env)
{
read_char(env);
switch(env->current_char) {
WHITESPACE:
+ env->state = system_literal_start_state;
+ next_char(env);
break;
- case '[':
- env->state = doctype_internal_subset_state;
- break;
- case '>':
- base_state(env);
- report_empty_token(DOCTYPE_END_TOKEN, env);
- next_char(env);
- token_start_position(env);
- return NO_ERROR;
default:
next_char(env);
- return INVALID_DOCTYPE_DECL;
+ return EXPECTING_WHITESPACE;
}
- next_char(env);
return NO_ERROR;
}
FAXPP_Error
-doctype_internal_subset_state(FAXPP_TokenizerEnv *env)
+system_literal_start_state(FAXPP_TokenizerEnv *env)
{
read_char(env);
switch(env->current_char) {
- case ']':
- env->state = doctype_end_state;
+ WHITESPACE:
next_char(env);
return NO_ERROR;
- LINE_ENDINGS
+ case '"':
+ env->state = system_literal_quot_state;
+ break;
+ case '\'':
+ env->state = system_literal_apos_state;
+ break;
default:
next_char(env);
- return INVALID_DOCTYPE_DECL;
+ return EXPECTING_SYSTEM_LITERAL;
}
+ next_char(env);
+ token_start_position(env);
return NO_ERROR;
}
FAXPP_Error
-doctype_end_state(FAXPP_TokenizerEnv *env)
+system_literal_apos_state(FAXPP_TokenizerEnv *env)
{
- read_char(env);
+ while(1) {
+ read_char(env);
- switch(env->current_char) {
- WHITESPACE:
+ switch(env->current_char) {
+ case '\'':
+ retrieve_state(env);
+ token_end_position(env);
+ report_token(SYSTEM_LITERAL_TOKEN, env);
+ next_char(env);
+ return NO_ERROR;
+ LINE_ENDINGS
+ default:
+ if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) {
+ next_char(env);
+ return RESTRICTED_CHAR;
+ }
+ break;
+ }
next_char(env);
- break;
- case '>':
- base_state(env);
- report_empty_token(DOCTYPE_END_TOKEN, env);
+ }
+
+ // Never happens
+ return NO_ERROR;
+}
+
+FAXPP_Error
+system_literal_quot_state(FAXPP_TokenizerEnv *env)
+{
+ while(1) {
+ read_char(env);
+
+ switch(env->current_char) {
+ case '"':
+ retrieve_state(env);
+ token_end_position(env);
+ report_token(SYSTEM_LITERAL_TOKEN, env);
+ next_char(env);
+ return NO_ERROR;
+ LINE_ENDINGS
+ default:
+ if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) {
+ next_char(env);
+ return RESTRICTED_CHAR;
+ }
+ break;
+ }
next_char(env);
- token_start_position(env);
- break;
- default:
- next_char(env);
- return INVALID_DOCTYPE_DECL;
}
+
+ // Never happens
return NO_ERROR;
}
-SINGLE_CHAR_STATE(system_id_initial_state1, 'Y', 0, system_id_initial_state2, INVALID_SYSTEM_ID)
-SINGLE_CHAR_STATE(system_id_initial_state2, 'S', 0, system_id_initial_state3, INVALID_SYSTEM_ID)
-SINGLE_CHAR_STATE(system_id_initial_state3, 'T', 0, system_id_initial_state4, INVALID_SYSTEM_ID)
-SINGLE_CHAR_STATE(system_id_initial_state4, 'E', 0, system_id_initial_state5, INVALID_SYSTEM_ID)
-SINGLE_CHAR_STATE(system_id_initial_state5, 'M', 0, system_id_ws_state, INVALID_SYSTEM_ID)
+SINGLE_CHAR_STATE(public_id_initial_state1, 'U', 0, public_id_initial_state2, INVALID_PUBLIC_ID)
+SINGLE_CHAR_STATE(public_id_initial_state2, 'B', 0, public_id_initial_state3, INVALID_PUBLIC_ID)
+SINGLE_CHAR_STATE(public_id_initial_state3, 'L', 0, public_id_initial_state4, INVALID_PUBLIC_ID)
+SINGLE_CHAR_STATE(public_id_initial_state4, 'I', 0, public_id_initial_state5, INVALID_PUBLIC_ID)
+SINGLE_CHAR_STATE(public_id_initial_state5, 'C', 0, public_id_ws_state, INVALID_PUBLIC_ID)
FAXPP_Error
-system_id_ws_state(FAXPP_TokenizerEnv *env)
+public_id_ws_state(FAXPP_TokenizerEnv *env)
{
read_char(env);
switch(env->current_char) {
WHITESPACE:
- env->state = system_literal_start_state;
+ env->state = pubid_literal_start_state;
next_char(env);
break;
default:
@@ -309,7 +350,7 @@
}
FAXPP_Error
-system_literal_start_state(FAXPP_TokenizerEnv *env)
+pubid_literal_start_state(FAXPP_TokenizerEnv *env)
{
read_char(env);
@@ -318,14 +359,14 @@
next_char(env);
return NO_ERROR;
case '"':
- env->state = system_literal_quot_state;
+ env->state = pubid_literal_quot_state;
break;
case '\'':
- env->state = system_literal_apos_state;
+ env->state = pubid_literal_apos_state;
break;
default:
next_char(env);
- return EXPECTING_SYSTEM_LITERAL;
+ return EXPECTING_PUBID_LITERAL;
}
next_char(env);
token_start_position(env);
@@ -333,25 +374,38 @@
}
FAXPP_Error
-system_literal_apos_state(FAXPP_TokenizerEnv *env)
+pubid_literal_apos_state(FAXPP_TokenizerEnv *env) // Scans an apostrophe-quoted PubidLiteral; reports PUBID_LITERAL_TOKEN at the closing quote
{
while(1) {
read_char(env);
switch(env->current_char) {
case '\'':
- retrieve_state(env);
+ env->state = system_id_ws_state; // ExternalID ::= 'PUBLIC' S PubidLiteral S SystemLiteral, so whitespace + a system literal must follow
token_end_position(env);
- report_token(SYSTEM_LITERAL_TOKEN, env);
+ report_token(PUBID_LITERAL_TOKEN, env);
next_char(env);
return NO_ERROR;
+ // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
LINE_ENDINGS
+ case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: // 0-9
+ // A-Z
+ case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
+ case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
+ case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
+ case 0x58: case 0x59: case 0x5A:
+ // a-z
+ case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
+ case 0x68: case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
+ case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
+ case 0x78: case 0x79: case 0x7A:
+ case ' ': case '-': case '(': case ')': case '+': case ',': case '.': case '/': case ':':
+ case '=': case '?': case ';': case '!': case '*': case '#': case '@': case '$': case '_': case '%':
+ // Valid PubidChar
+ break;
default:
- if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) {
- next_char(env);
- return RESTRICTED_CHAR;
- }
- break;
+ next_char(env); // consume the offending character before reporting it
+ return INVALID_CHAR_IN_PUBID_LITERAL;
}
next_char(env);
}
@@ -361,25 +415,38 @@
}
FAXPP_Error
-system_literal_quot_state(FAXPP_TokenizerEnv *env)
+pubid_literal_quot_state(FAXPP_TokenizerEnv *env) // Scans a double-quoted PubidLiteral; reports PUBID_LITERAL_TOKEN at the closing quote
{
while(1) {
read_char(env);
switch(env->current_char) {
case '"':
- retrieve_state(env);
+ env->state = system_id_ws_state; // ExternalID ::= 'PUBLIC' S PubidLiteral S SystemLiteral, so whitespace + a system literal must follow
token_end_position(env);
- report_token(SYSTEM_LITERAL_TOKEN, env);
+ report_token(PUBID_LITERAL_TOKEN, env);
next_char(env);
return NO_ERROR;
+ // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
LINE_ENDINGS
+ case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37: case 0x38: case 0x39: // 0-9
+ // A-Z
+ case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
+ case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
+ case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
+ case 0x58: case 0x59: case 0x5A:
+ // a-z
+ case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
+ case 0x68: case 0x69: case 0x6A: case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
+ case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
+ case 0x78: case 0x79: case 0x7A:
+ case ' ': case '-': case '(': case ')': case '+': case ',': case '.': case '/': case ':':
+ case '=': case '?': case ';': case '!': case '*': case '#': case '@': case '$': case '_': case '%': case '\'':
+ // Valid PubidChar
+ break;
default:
- if((FAXPP_char_flags(env->current_char) & env->non_restricted_char) == 0) {
- next_char(env);
- return RESTRICTED_CHAR;
- }
- break;
+ next_char(env); // consume the offending character before reporting it
+ return INVALID_CHAR_IN_PUBID_LITERAL;
}
next_char(env);
}
@@ -388,3 +455,99 @@
return NO_ERROR;
}
+FAXPP_Error // Tokenizer state that accepts whitespace, '[' (internal subset) or '>' (end of DOCTYPE).
+doctype_internal_subset_start_state(FAXPP_TokenizerEnv *env) // Returns NO_ERROR, or INVALID_DOCTYPE_DECL for any other character.
+{
+ read_char(env);
+ // NOTE(review): this state is saved in env->stored_state before an external ID is parsed; presumably resumed via retrieve_state() - confirm
+ switch(env->current_char) {
+ WHITESPACE:
+ break; // skip whitespace; state is unchanged
+ case '[':
+ env->state = internal_subset_state;
+ env->in_internal_subset = 1; // base_state() checks this flag to return to internal_subset_state
+ break;
+ case '>':
+ base_state(env); // macro selects the follow-on state
+ report_empty_token(DOCTYPE_END_TOKEN, env);
+ next_char(env);
+ token_start_position(env);
+ return NO_ERROR;
+ default:
+ next_char(env); // consume the unexpected character before reporting the error
+ return INVALID_DOCTYPE_DECL;
+ }
+ next_char(env); // shared by the whitespace and '[' branches
+ return NO_ERROR;
+}
+
+FAXPP_Error // Tokenizer state for the content between '[' and ']' of a DOCTYPE declaration.
+internal_subset_state(FAXPP_TokenizerEnv *env) // Returns NO_ERROR, or INVALID_DOCTYPE_DECL for anything but ']', whitespace or '<'.
+{
+ read_char(env);
+
+ switch(env->current_char) {
+ case ']':
+ env->state = doctype_end_state;
+ env->in_internal_subset = 0; // leaving the subset: base_state() must no longer route back here
+ break;
+ WHITESPACE:
+ break; // skip whitespace; state is unchanged
+ case '<':
+ env->state = internal_subset_markup_state; // next character decides what kind of markup this is
+ break;
+ default:
+ next_char(env); // consume the unexpected character before reporting the error
+ return INVALID_DOCTYPE_DECL;
+ }
+ // every accepted character is consumed here
+ next_char(env);
+ return NO_ERROR;
+}
+
+FAXPP_Error // Tokenizer state entered after '<' inside the internal subset: dispatches on the following character.
+internal_subset_markup_state(FAXPP_TokenizerEnv *env) // Returns NO_ERROR for '?' or '!', otherwise INVALID_DTD_DECL.
+{
+ read_char(env);
+
+ switch(env->current_char) {
+ case '?':
+ env->state = pi_name_start_state; // "<?" - hand over to the processing-instruction states
+ break;
+ case '!':
+ env->state = comment_start_state1; // NOTE(review): every "<!" goes to the comment states, so other declarations are presumably rejected there - confirm
+ break;
+ LINE_ENDINGS
+ default:
+/* env->state = internal_subset_decl_state; */
+ next_char(env); // consume the unexpected character before reporting the error
+ return INVALID_DTD_DECL;
+ }
+ // accepted: consume the character and start the next token after it
+ next_char(env);
+ token_start_position(env);
+ return NO_ERROR;
+}
+
+FAXPP_Error // Tokenizer state entered after the ']' that closes the internal subset: expects optional whitespace, then '>'.
+doctype_end_state(FAXPP_TokenizerEnv *env) // Returns NO_ERROR, or INVALID_DOCTYPE_DECL for any other character.
+{
+ read_char(env);
+
+ switch(env->current_char) {
+ WHITESPACE:
+ next_char(env); // skip whitespace; state is unchanged
+ break;
+ case '>':
+ base_state(env); // macro selects the follow-on state
+ report_empty_token(DOCTYPE_END_TOKEN, env);
+ next_char(env);
+ token_start_position(env); // the next token begins after the '>'
+ break;
+ default:
+ next_char(env); // consume the unexpected character before reporting the error
+ return INVALID_DOCTYPE_DECL;
+ }
+ return NO_ERROR;
+}
+
Modified: trunk/faxpp/src/error.c
===================================================================
--- trunk/faxpp/src/error.c 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/error.c 2007-09-28 00:29:16 UTC (rev 33)
@@ -91,6 +91,14 @@
return "INVALID_SYSTEM_ID";
case EXPECTING_SYSTEM_LITERAL:
return "EXPECTING_SYSTEM_LITERAL";
+ case INVALID_PUBLIC_ID:
+ return "INVALID_PUBLIC_ID";
+ case INVALID_CHAR_IN_PUBID_LITERAL:
+ return "INVALID_CHAR_IN_PUBID_LITERAL";
+ case EXPECTING_PUBID_LITERAL:
+ return "EXPECTING_PUBID_LITERAL";
+ case INVALID_DTD_DECL:
+ return "INVALID_DTD_DECL";
case NO_ERROR:
break;
}
Modified: trunk/faxpp/src/token.c
===================================================================
--- trunk/faxpp/src/token.c 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/token.c 2007-09-28 00:29:16 UTC (rev 33)
@@ -92,6 +92,8 @@
case SYSTEM_LITERAL_TOKEN:
return "SYSTEM_LITERAL_TOKEN";
+ case PUBID_LITERAL_TOKEN:
+ return "PUBID_LITERAL_TOKEN";
case NO_TOKEN:
break;
Modified: trunk/faxpp/src/tokenizer_states.c
===================================================================
--- trunk/faxpp/src/tokenizer_states.c 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/tokenizer_states.c 2007-09-28 00:29:16 UTC (rev 33)
@@ -411,8 +411,10 @@
return "doctype_after_name_state";
else if(state == doctype_internal_subset_start_state)
return "doctype_internal_subset_start_state";
- else if(state == doctype_internal_subset_state)
- return "doctype_internal_subset_state";
+ else if(state == internal_subset_state)
+ return "internal_subset_state";
+ else if(state == internal_subset_markup_state)
+ return "internal_subset_markup_state";
else if(state == doctype_end_state)
return "doctype_end_state";
@@ -435,6 +437,25 @@
else if(state == system_literal_quot_state)
return "system_literal_quot_state";
+ else if(state == public_id_initial_state1)
+ return "public_id_initial_state1";
+ else if(state == public_id_initial_state2)
+ return "public_id_initial_state2";
+ else if(state == public_id_initial_state3)
+ return "public_id_initial_state3";
+ else if(state == public_id_initial_state4)
+ return "public_id_initial_state4";
+ else if(state == public_id_initial_state5)
+ return "public_id_initial_state5";
+ else if(state == public_id_ws_state)
+ return "public_id_ws_state";
+ else if(state == pubid_literal_start_state)
+ return "pubid_literal_start_state";
+ else if(state == pubid_literal_apos_state)
+ return "pubid_literal_apos_state";
+ else if(state == pubid_literal_quot_state)
+ return "pubid_literal_quot_state";
+
return "unknown";
}
#endif
Modified: trunk/faxpp/src/tokenizer_states.h
===================================================================
--- trunk/faxpp/src/tokenizer_states.h 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/tokenizer_states.h 2007-09-28 00:29:16 UTC (rev 33)
@@ -253,7 +253,8 @@
FAXPP_Error doctype_name_seen_colon_state2(FAXPP_TokenizerEnv *env);
FAXPP_Error doctype_after_name_state(FAXPP_TokenizerEnv *env);
FAXPP_Error doctype_internal_subset_start_state(FAXPP_TokenizerEnv *env);
-FAXPP_Error doctype_internal_subset_state(FAXPP_TokenizerEnv *env);
+FAXPP_Error internal_subset_state(FAXPP_TokenizerEnv *env);
+FAXPP_Error internal_subset_markup_state(FAXPP_TokenizerEnv *env);
FAXPP_Error doctype_end_state(FAXPP_TokenizerEnv *env);
FAXPP_Error system_id_initial_state1(FAXPP_TokenizerEnv *env);
@@ -266,7 +267,17 @@
FAXPP_Error system_literal_apos_state(FAXPP_TokenizerEnv *env);
FAXPP_Error system_literal_quot_state(FAXPP_TokenizerEnv *env);
+FAXPP_Error public_id_initial_state1(FAXPP_TokenizerEnv *env);
+FAXPP_Error public_id_initial_state2(FAXPP_TokenizerEnv *env);
+FAXPP_Error public_id_initial_state3(FAXPP_TokenizerEnv *env);
+FAXPP_Error public_id_initial_state4(FAXPP_TokenizerEnv *env);
+FAXPP_Error public_id_initial_state5(FAXPP_TokenizerEnv *env);
+FAXPP_Error public_id_ws_state(FAXPP_TokenizerEnv *env);
+FAXPP_Error pubid_literal_start_state(FAXPP_TokenizerEnv *env);
+FAXPP_Error pubid_literal_apos_state(FAXPP_TokenizerEnv *env);
+FAXPP_Error pubid_literal_quot_state(FAXPP_TokenizerEnv *env);
+
/*********************
*
* Tokenizer Helper Functions
@@ -340,6 +351,8 @@
if((env)->nesting_level == 0) \
if((env)->seen_doc_element) \
(env)->state = final_state; \
+ else if((env)->in_internal_subset) \
+ (env)->state = internal_subset_state; \
else (env)->state = initial_misc_state; \
else (env)->state = (env)->element_content_state; \
}
Modified: trunk/faxpp/src/xml_parser.c
===================================================================
--- trunk/faxpp/src/xml_parser.c 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/xml_parser.c 2007-09-28 00:29:16 UTC (rev 33)
@@ -814,6 +814,7 @@
case DOCTYPE_NAME_TOKEN:
case DOCTYPE_END_TOKEN:
case SYSTEM_LITERAL_TOKEN:
+ case PUBID_LITERAL_TOKEN:
// TBD - jpcs
break;
Modified: trunk/faxpp/src/xml_tokenizer.c
===================================================================
--- trunk/faxpp/src/xml_tokenizer.c 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/xml_tokenizer.c 2007-09-28 00:29:16 UTC (rev 33)
@@ -389,6 +389,7 @@
env->nesting_level = 0;
env->do_encode = 1;
env->seen_doctype = 0;
+ env->in_internal_subset = 0;
env->seen_doc_element = 0;
env->buffer_done = done;
Modified: trunk/faxpp/src/xml_tokenizer.h
===================================================================
--- trunk/faxpp/src/xml_tokenizer.h 2007-09-25 11:50:47 UTC (rev 32)
+++ trunk/faxpp/src/xml_tokenizer.h 2007-09-28 00:29:16 UTC (rev 33)
@@ -44,6 +44,7 @@
unsigned int nesting_level;
unsigned int do_encode:1;
unsigned int seen_doctype:1;
+ unsigned int in_internal_subset:1;
unsigned int seen_doc_element:1;
unsigned int buffer_done:1;
unsigned int normalize_attrs:1;
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|